From c07bcfb13738b3c22d182b9350d70aeeee7148b9 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Sat, 24 Jul 2021 00:40:41 +0000 Subject: [PATCH 01/44] Add sycl configure option and memory functionality This is an initial commit that still needs some reworking and debugging. --- src/config/HYPRE_config.h.in | 3 + src/config/Makefile.config.in | 10 +- src/config/configure.in | 117 ++++++++++++++++++++++++ src/configure | 141 ++++++++++++++++++++++++++++- src/test/Makefile | 5 + src/utilities/Makefile | 5 +- src/utilities/_hypre_utilities.h | 10 ++ src/utilities/_hypre_utilities.hpp | 75 +++++++++++++++ src/utilities/general.c | 112 ++++++++++++++++++++++- src/utilities/handle.h | 10 ++ src/utilities/headers | 1 + src/utilities/memory.c | 95 ++++++++++++++++++- 12 files changed, 574 insertions(+), 10 deletions(-) diff --git a/src/config/HYPRE_config.h.in b/src/config/HYPRE_config.h.in index 98425a6545..bcbeef6853 100644 --- a/src/config/HYPRE_config.h.in +++ b/src/config/HYPRE_config.h.in @@ -187,6 +187,9 @@ /* HIP being used */ #undef HYPRE_USING_HIP +/* SYCL being used */ +#undef HYPRE_USING_SYCL + /* Define to 1 if using host memory only */ #undef HYPRE_USING_HOST_MEMORY diff --git a/src/config/Makefile.config.in b/src/config/Makefile.config.in index a99223c346..cf605d1c0f 100644 --- a/src/config/Makefile.config.in +++ b/src/config/Makefile.config.in @@ -87,10 +87,10 @@ AR = @AR@ RANLIB = @RANLIB@ LDFLAGS = @LDFLAGS@ -LIBS = @LIBS@ ${CALIPER_LIBS} ${HYPRE_CUDA_LIBS} ${HYPRE_HIP_LIBS} ${HYPRE_RAJA_LIB_DIR} ${HYPRE_RAJA_LIB} ${HYPRE_KOKKOS_LIB_DIR} ${HYPRE_KOKKOS_LIB} ${HYPRE_UMPIRE_LIB_DIR} ${HYPRE_UMPIRE_LIB} +LIBS = @LIBS@ ${CALIPER_LIBS} ${HYPRE_CUDA_LIBS} ${HYPRE_HIP_LIBS} ${HYPRE_SYCL_LIBS} ${HYPRE_RAJA_LIB_DIR} ${HYPRE_RAJA_LIB} ${HYPRE_KOKKOS_LIB_DIR} ${HYPRE_KOKKOS_LIB} ${HYPRE_UMPIRE_LIB_DIR} ${HYPRE_UMPIRE_LIB} FLIBS = @FLIBS@ -INCLUDES = ${CALIPER_INCLUDE} ${HYPRE_CUDA_INCLUDE} ${HYPRE_HIP_INCLUDE} ${HYPRE_RAJA_INCLUDE} ${HYPRE_KOKKOS_INCLUDE} 
${HYPRE_UMPIRE_INCLUDE} ${HYPRE_NAP_INCLUDE} +INCLUDES = ${CALIPER_INCLUDE} ${HYPRE_CUDA_INCLUDE} ${HYPRE_HIP_INCLUDE} ${HYPRE_SYCL_INCLUDE} ${HYPRE_RAJA_INCLUDE} ${HYPRE_KOKKOS_INCLUDE} ${HYPRE_UMPIRE_INCLUDE} ${HYPRE_NAP_INCLUDE} ################################################################## ## LAPACK Library Flags @@ -131,6 +131,12 @@ CUDA_ARCH = @HYPRE_CUDA_GENCODE@ HYPRE_HIP_INCLUDE = @HYPRE_HIP_INCL@ HYPRE_HIP_LIBS = @HYPRE_HIP_LIBS@ +################################################################## +## SYCL options +################################################################## +HYPRE_SYCL_INCLUDE=@HYPRE_SYCL_INCL@ +HYPRE_SYCL_LIBS=@HYPRE_SYCL_LIBS@ + ################################################################## ## Caliper options ################################################################## diff --git a/src/config/configure.in b/src/config/configure.in index d54ac936d8..3f7acd524f 100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -182,6 +182,16 @@ hypre_using_rocrand=no hypre_found_hip=no +dnl ********************************************************************* +dnl * Initialize hypre-SYCL variables +dnl ********************************************************************* +hypre_using_sycl=no +hypre_using_onemklsparse=no +hypre_using_onemklblas=no +hypre_using_onemklrand=no + +hypre_found_sycl=no + dnl ********************************************************************* dnl * Initialize flag-check variables @@ -1137,6 +1147,19 @@ AS_HELP_STRING([--with-hip], [hypre_using_hip=no] ) +dnl ***** SYCL +AC_ARG_WITH(sycl, +AS_HELP_STRING([--with-sycl], + [Use SYCL for Intel GPUs. 
(default is NO).]), +[case "$withval" in + yes) hypre_using_sycl=yes ;; + no) hypre_using_sycl=no ;; + *) hypre_using_sycl=no ;; +esac], +[hypre_using_sycl=no] +) + + AC_ARG_WITH(cuda-home, AS_HELP_STRING([--with-cuda-home=DIR], [User specifies CUDA_HOME in DIR.]), @@ -1977,7 +2000,26 @@ AS_IF([ test x"$hypre_using_hip" == x"yes" ], [AC_MSG_ERROR([unable to find ${HYPRE_ROCM_PREFIX}/include/hip/hip_common.h ... Ensure ROCm is installed and set ROCM_PATH environment variable to ROCm installation path.])] )], []) +dnl ********************************************************************* +dnl * Check for SYCL header +dnl ********************************************************************* +dnl If the user has requested to use SYCL, we first check the environment +dnl for ONEAPI_PATH to point at the oneAPI installation. If that is not found, +dnl then we default to `/opt/intel/oneapi`. +dnl +dnl TODO: Add an ARG_WITH for sycl so the user can control the oneAPI path +dnl through the configure line +AS_IF([ test x"$hypre_using_sycl" == x"yes" ], + [ AS_IF([ test -n "$ONEAPI_PATH"], + [ HYPRE_SYCL_PREFIX=$ONEAPI_PATH ], + [ HYPRE_SYCL_PREFIX=/opt/intel/oneapi ]) + + AC_SUBST(HYPRE_SYCL_PREFIX) + AC_CHECK_HEADERS( ["${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp"], + [hypre_found_sycl=yes], + [AC_MSG_ERROR([unable to find ${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp ... 
Ensure oneAPI SDK is installed and set ONEAPI_PATH environment variable to oneAPI installation path.])] )],
+  [])
 
 dnl *********************************************************************
 dnl * Set raja options
@@ -2241,6 +2283,67 @@ AS_IF([test x"$hypre_using_hip" == x"yes"],
     ]) dnl AS_IF([test x"$hypre_using_hip" == x"yes"]
 
+dnl *********************************************************************
+dnl * Set SYCL options
+dnl *********************************************************************
+AS_IF([test x"$hypre_user_chose_sycl" == x"yes"],
+    [
+     AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device])
+     AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used])
+
+     dnl The actual invocation of the clang compiler from oneAPI that
+     dnl supports SYCL and all the command line foo needed by the compiler.
+     AC_CHECK_PROGS(CXX, [dpcpp])
+
+     dnl (Ab)Using dpcpp when compiling SYCL
+     LINK_CC=${CXX}
+     LINK_CXX=${CXX}
+
+     dnl The "-x sycl" is necessary to override the detection of .c files which clang
+     dnl interprets as C and therefore invokes the C compiler rather than the SYCL part
+     dnl of clang. Put SYCLCXXFLAGS at the end so the user can override
+     dnl from the configure line.
+     SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel "
+
+     dnl If not in debug mode, at least -O2, but the user can override
+     dnl with SYCLCXXFLAGS on the configure line. If in debug mode, -O0 -Wall
+     dnl plus flags for debugging symbols
+     AS_IF([test x"$hypre_using_debug" == x"yes"],
+           [SYCLCXXFLAGS="-O0 -Wall -g -gdb ${SYCLCXXFLAGS}"],
+           [SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"],)
+
+     dnl (Ab)Use CXXFLAGS to capture SYCL compilation flags
+     dnl Put SYCLCXXFLAGS at the end so the user can override the optimization level.
+     CXXFLAGS="${SYCLCXXFLAGS}"
+
+     dnl dpl, dpct so we need both for Thrust on Intel GPUs.
+     dnl These are header-only so no linking needed. 
+ HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" + + dnl SYCL library + HYPRE_SYCL_LIBS="-L${HYPRE_SYCL_PREFIX}/lib -lamdsycl64" + + AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" + ]) + + AS_IF([test x"$hypre_using_onemklblas" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLBLAS, 1, [onemkl::BLAS being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" + ]) + + dnl onemklrand: random number generation on Intel GPUs + AS_IF([test x"$hypre_using_onemklrand" == x"yes"], + [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used]) + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" + ]) + + ]) dnl AS_IF([test x"$hypre_user_chose_sycl" == x"yes"] dnl ********************************************************************* @@ -2265,6 +2368,14 @@ then AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) AC_MSG_NOTICE([***********************************************************************]) fi + if test "$hypre_user_chose_sycl" = "yes" + then + AC_MSG_NOTICE([***********************************************************]) + AC_MSG_NOTICE([Configuring with --with-sycl=yes without unified memory.]) + AC_MSG_NOTICE([It only works for struct interface.]) + AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) + AC_MSG_NOTICE([***********************************************************]) + fi if test "$hypre_using_device_openmp" = "yes" then AC_MSG_NOTICE([***********************************************************************]) @@ -2512,6 +2623,12 @@ dnl 
********************************************************************* AC_SUBST(HYPRE_HIP_INCL) AC_SUBST(HYPRE_HIP_LIBS) +dnl ********************************************************************* +dnl * SYCL stuff +dnl ********************************************************************* +AC_SUBST(HYPRE_SYCL_INCL) +AC_SUBST(HYPRE_SYCL_LIBS) + dnl ********************************************************************* dnl * Caliper instrumentation dnl ********************************************************************* diff --git a/src/configure b/src/configure index 62b3f3fb26..cc2fee9bd2 100755 --- a/src/configure +++ b/src/configure @@ -633,6 +633,8 @@ SUPERLU_LIBS SUPERLU_INCLUDE CALIPER_LIBS CALIPER_INCLUDE +HYPRE_SYCL_LIBS +HYPRE_SYCL_INCL HYPRE_HIP_LIBS HYPRE_HIP_INCL HYPRE_CUDA_LIBS @@ -830,6 +832,7 @@ with_mli with_MPI with_cuda with_hip +with_sycl with_cuda_home with_gpu_arch with_raja @@ -1635,6 +1638,7 @@ Optional Packages: --with-cuda Use CUDA. Require cuda-8.0 or higher (default is NO). --with-hip Use HIP for AMD GPUs. (default is NO). + --with-sycl Use SYCL for Intel GPUs. (default is NO). --with-cuda-home=DIR User specifies CUDA_HOME in DIR. --with-gpu-arch=ARG User specifies NVIDIA GPU architecture that the CUDA files will be compiled for in ARG, where ARG is a @@ -2750,6 +2754,13 @@ hypre_using_rocrand=no hypre_found_hip=no +hypre_using_sycl=no +hypre_using_onemklsparse=no +hypre_using_onemklblas=no +hypre_using_onemklrand=no + +hypre_found_sycl=no + hypre_blas_lib_old_style=no hypre_blas_lib_dir_old_style=no @@ -3953,6 +3964,22 @@ fi + +# Check whether --with-sycl was given. +if test "${with_sycl+set}" = set; then : + withval=$with_sycl; case "$withval" in +yes) hypre_using_sycl=yes ;; +no) hypre_using_sycl=no ;; +*) hypre_using_sycl=no ;; +esac +else + hypre_using_sycl=no + +fi + + + + # Check whether --with-cuda-home was given. 
if test "${with_cuda_home+set}" = set; then : withval=$with_cuda_home; for cuda_dir in $withval; do @@ -8536,7 +8563,7 @@ fi if test "x$hypre_using_um" = "xyes" then - if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" + if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes" then as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, or device OpenMP!" "$LINENO" 5 fi @@ -9010,6 +9037,105 @@ fi fi +if test x"$hypre_using_sycl" == x"yes"; then : + +# WM: not setting this with sycl for now since it is giving me problems +$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h + +$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h + + + for ac_prog in dpcpp +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUCC"; then + ac_cv_prog_CUCC="$CUCC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUCC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CUCC=$ac_cv_prog_CUCC +if test -n "$CUCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 +$as_echo "$CUCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUCC" && break +done + + + LINK_CC=${CUCC} + LINK_CXX=${CUCC} + + SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + + if test x"$hypre_using_debug" == x"yes"; then : + SYCLCXXFLAGS="-O0 -Wall -g -gdbx ${SYCLCXXFLAGS}" +elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : + +fi + + CUFLAGS="${SYCLCXXFLAGS}" + + HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" + + + if test x"$hypre_using_onemklsparse" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/mkl/spblas.hpp" + +fi + + if test x"$hypre_using_onemklblas" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" + +fi + + if test x"$hypre_using_onemklrand" == x"yes"; then : + +$as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h + + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" + +fi + + +fi + + if test "$hypre_using_um" != "yes" then @@ -9038,6 +9164,19 @@ $as_echo "$as_me: It only works for structured solvers and selected unstructured $as_echo "$as_me: Use --enable-unified-memory to compile with unified 
memory." >&6;} { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************************" >&5 $as_echo "$as_me: ***********************************************************************" >&6;} + fi + if test "$hypre_using_sycl" = "yes" + then + { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************" >&5 +$as_echo "$as_me: ***********************************************************" >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Configuring with --with-sycl=yes without unified memory." >&5 +$as_echo "$as_me: Configuring with --with-sycl=yes without unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: It only works for struct interface." >&5 +$as_echo "$as_me: It only works for struct interface." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: Use --enable-unified-memory to compile with unified memory." >&5 +$as_echo "$as_me: Use --enable-unified-memory to compile with unified memory." >&6;} + { $as_echo "$as_me:${as_lineno-$LINENO}: ***********************************************************" >&5 +$as_echo "$as_me: ***********************************************************" >&6;} fi if test "$hypre_using_device_openmp" = "yes" then diff --git a/src/test/Makefile b/src/test/Makefile index 8922d0ab2d..10c3ac32cf 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -64,6 +64,7 @@ LFLAGS =\ HYPRE_DRIVERS =\ ij.c\ + simple.c\ ij_assembly.c\ sstruct.c\ struct.c\ @@ -143,6 +144,10 @@ ij: ij.${OBJ_SUFFIX} @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} +simple: simple.${OBJ_SUFFIX} + @echo "Building" $@ "... " + ${LINK_CC} -o $@ $< ${LFLAGS} + ij_assembly: ij_assembly.${OBJ_SUFFIX} @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 8281d38f33..4b37e1cd3e 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -62,8 +62,9 @@ CUFILES=\ general.c\ handle.c\ memory.c\ - omp_device.c \ - nvtx.c + omp_device.c\ + nvtx.c\ + sycl_utils.c COBJS = ${FILES:.c=.o} CUOBJS = ${CUFILES:.c=.obj} diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index c2a56322d2..503c13f2d3 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1209,6 +1209,9 @@ static char hypre__levelname[16]; struct hypre_CudaData; typedef struct hypre_CudaData hypre_CudaData; +struct hypre_SyclData; +typedef struct hypre_SyclData hypre_SyclData; + typedef struct { HYPRE_Int hypre_error; @@ -1234,6 +1237,9 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclData *sycl_data; +#endif } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -1241,6 +1247,7 @@ typedef struct #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) #define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) +#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) #define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) #define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) @@ -1283,6 +1290,9 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) +#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) +#define 
hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 75fe8ecb02..4d4eed5220 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -2111,6 +2111,81 @@ struct hypre_cub_CachingDeviceAllocator #endif // #if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL) #endif // #ifndef HYPRE_CUB_ALLOCATOR_HEADER +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. 
+ * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +#ifndef HYPRE_SYCL_UTILS_HPP +#define HYPRE_SYCL_UTILS_HPP + +#if defined(HYPRE_USING_SYCL) + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +#include +/* #include */ +/* #include */ + +#define HYPRE_SYCL_CALL( EXPR ) \ + try \ + { \ + EXPR; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) { \ + hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + +// HYPRE_SUBGROUP_BITSHIFT is just log2 of HYPRE_SUBGROUP_SIZE +#define HYPRE_SUBGROUP_SIZE 32 +#define HYPRE_SUBGROUP_BITSHIFT 5 +#define HYPRE_MAX_NUM_SUBGROUPS (64 * 64 * 32) +#define HYPRE_FLT_LARGE 1e30 +#define HYPRE_1D_BLOCK_SIZE 512 +#define HYPRE_MAX_NUM_QUEUES 10 + +struct hypre_SyclData +{ + sycl::queue* sycl_queues[HYPRE_MAX_NUM_QUEUES] = {}; + sycl::device sycl_device; + + /* by default, hypre puts GPU computations in this queue + * Do not be confused with the default (null) SYCL queue */ + HYPRE_Int sycl_compute_queue_num; +}; + +#define hypre_SyclDataSyclDevice(data) ((data) -> sycl_device) +#define hypre_SyclDataSyclComputeQueueNum(data) ((data) -> sycl_compute_queue_num) + +hypre_SyclData* hypre_SyclDataCreate(); +void hypre_SyclDataDestroy(hypre_SyclData* data); + +sycl::queue *hypre_SyclDataSyclQueue(hypre_SyclData *data, HYPRE_Int i); +sycl::queue *hypre_SyclDataSyclComputeQueue(hypre_SyclData *data); + +#endif // #if defined(HYPRE_USING_SYCL) + +#endif /* #ifndef HYPRE_SYCL_UTILS_HPP */ #ifdef __cplusplus } diff --git a/src/utilities/general.c b/src/utilities/general.c index f52820f5e6..11d747afad 
100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -54,6 +54,12 @@ hypre_HandleCreate() hypre_HandleCudaData(hypre_handle_) = hypre_CudaDataCreate(); #endif +#if defined(HYPRE_USING_SYCL) + hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; + hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; + hypre_HandleSyclData(hypre_handle_) = hypre_SyclDataCreate(); +#endif + return hypre_handle_; } @@ -69,6 +75,10 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_CudaDataDestroy(hypre_HandleCudaData(hypre_handle_)); #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclDataDestroy(hypre_HandleSyclData(hypre_handle_)); +#endif + hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); return hypre_error_flag; @@ -98,6 +108,67 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) } #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO - this ain't it... + hypre_int nDevices=0; + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(); + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + nDevices += subDevicesDomainNuma.size(); + } + else + { + nDevices++; + } + } + } + + if (device_id > nDevices) + { + // WM: debug + hypre_printf("device_id = %d, nDevices = %d\n", device_id, nDevices); + hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); + } + + HYPRE_Int local_nDevices=0; + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + // multi-tile GPUs + if (gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + for (const auto &tile : subDevicesDomainNuma) + { + if (local_nDevices == device_id) + { + hypre_HandleSyclDevice(hypre_handle_) = tile; + } + local_nDevices++; + } + } + // single-tile GPUs + else + { + if (local_nDevices == device_id) + { + hypre_HandleSyclDevice(hypre_handle_) = gpu_devices[i]; + } + local_nDevices++; + } + } + } +#endif + return hypre_error_flag; } @@ -119,6 +190,10 @@ hypre_GetDevice(hypre_int *device_id) HYPRE_HIP_CALL( hipGetDevice(device_id) ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } @@ -137,6 +212,28 @@ hypre_GetDeviceCount(hypre_int *device_count) HYPRE_HIP_CALL( hipGetDeviceCount(device_count) ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO - verify + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(); + for (int i = 0; i < gpu_devices.size(); i++) + { + if (gpu_devices[i].is_gpu()) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( + sycl::info::partition_affinity_domain::numa); + (*device_count) += subDevicesDomainNuma.size(); + } + else + { + (*device_count)++; + } + } + } +#endif + return hypre_error_flag; } @@ -155,6 +252,10 @@ hypre_GetDeviceLastError() HYPRE_HIP_CALL( hipGetLastError() ); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } @@ -179,7 +280,7 @@ HYPRE_Init() _hypre_handle = hypre_HandleCreate(); } -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) hypre_GetDeviceLastError(); /* Notice: the cudaStream created is specific to the device @@ -192,7 +293,12 @@ HYPRE_Init() /* To include the 
cost of creating streams/cudahandles in HYPRE_Init */ /* If not here, will be done at the first use */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_HandleCudaComputeStream(_hypre_handle); +#endif +#if defined(HYPRE_USING_SYCL) + hypre_HandleSyclComputeQueue(_hypre_handle); +#endif /* A separate stream for prefetching */ //hypre_HandleCudaPrefetchStream(_hypre_handle); @@ -298,6 +404,10 @@ HYPRE_PrintDeviceInfo() hypre_printf("Running on \"%s\", major %d, minor %d, total memory %.2f GB\n", deviceProp.name, deviceProp.major, deviceProp.minor, deviceProp.totalGlobalMem/1e9); #endif +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif + return hypre_error_flag; } diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 2055073e8e..2c2bccfcc8 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -17,6 +17,9 @@ struct hypre_CudaData; typedef struct hypre_CudaData hypre_CudaData; +struct hypre_SyclData; +typedef struct hypre_SyclData hypre_SyclData; + typedef struct { HYPRE_Int hypre_error; @@ -42,6 +45,9 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif +#if defined(HYPRE_USING_SYCL) + hypre_SyclData *sycl_data; +#endif } hypre_Handle; /* accessor macros to hypre_Handle */ @@ -49,6 +55,7 @@ typedef struct #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) #define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) +#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) #define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) #define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) @@ -91,4 +98,7 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define 
hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) +#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) +#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif diff --git a/src/utilities/headers b/src/utilities/headers index cf74476aa0..0c96b33fae 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -92,6 +92,7 @@ cat umpire_allocator.h >> $INTERNAL_HEADER cat cuda_utils.h >> $INTERNAL_HEADER cat cuda_reducer.h >> $INTERNAL_HEADER cat cub_allocator.h >> $INTERNAL_HEADER +cat sycl_utils.h >> $INTERNAL_HEADER #=========================================================================== # Include guards diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 41005fe8dd..5dc5af7ea8 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -76,6 +76,10 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemset(ptr, value, num) ); #endif + +#if defined(HYPRE_USING_SYCL) + (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); +#endif } static inline void @@ -93,6 +97,10 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemset(ptr, value, num) ); #endif + +#if defined(HYPRE_USING_SYCL) + (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); +#endif } /*-------------------------------------------------------------------------- @@ -152,6 +160,10 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) *} */ #endif + +#if defined(HYPRE_USING_SYCL) + // WM: TODO +#endif } /*-------------------------------------------------------------------------- @@ -215,6 +227,10 @@ 
hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipMalloc(&ptr, size) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_device(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ if (ptr && zeroinit) @@ -250,6 +266,10 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipMallocManaged(&ptr, size, hipMemAttachGlobal) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ /* prefecth to device */ @@ -287,6 +307,10 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) HYPRE_HIP_CALL( hipHostMalloc(&ptr, size) ); #endif +#if defined(HYPRE_USING_SYCL) + ptr = (void *)sycl::malloc_host(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ if (ptr && zeroinit) @@ -380,6 +404,10 @@ hypre_DeviceFree(void *ptr) HYPRE_HIP_CALL( hipFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ } @@ -406,6 +434,10 @@ hypre_UnifiedFree(void *ptr) HYPRE_HIP_CALL( hipFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ } @@ -428,6 +460,10 @@ hypre_HostPinnedFree(void *ptr) HYPRE_HIP_CALL( hipHostFree(ptr) ); #endif +#if defined(HYPRE_USING_SYCL) + sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); +#endif + #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ } @@ -479,6 +515,10 @@ _hypre_Free(void *ptr, hypre_MemoryLocation location) static inline void hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst, hypre_MemoryLocation loc_src) { +#if defined(HYPRE_USING_SYCL) + 
sycl::queue* q = hypre_HandleSyclComputeQueue(hypre_handle()); +#endif + if (dst == NULL || src == NULL) { if (size) @@ -524,6 +564,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -542,6 +586,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyHostToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -560,6 +608,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToHost) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -583,6 +635,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyHostToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -606,6 +662,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToHost) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -630,6 +690,10 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #if defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipMemcpy(dst, src, size, hipMemcpyDeviceToDevice) ); #endif + +#if defined(HYPRE_USING_SYCL) + q->memcpy(dst, src, size).wait(); +#endif return; } @@ -654,7 +718,7 @@ hypre_GetExecPolicy1_core(hypre_MemoryLocation location) exec = HYPRE_EXEC_DEVICE; break; case 
hypre_MEMORY_UNIFIED : -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif break; @@ -701,7 +765,7 @@ hypre_GetExecPolicy2_core(hypre_MemoryLocation location1, if (location1 == hypre_MEMORY_UNIFIED && location2 == hypre_MEMORY_UNIFIED) { -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif } @@ -907,7 +971,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { HYPRE_Int ierr = 0; -#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) *memory_location = hypre_MEMORY_UNDEFINED; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -1002,7 +1066,30 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) } #endif // defined(HYPRE_USING_HIP) -#else /* #if defined(HYPRE_USING_GPU) */ +#if defined(HYPRE_USING_SYCL) + *memory_location = hypre_MEMORY_UNDEFINED; + sycl::usm::alloc allocType; + allocType = sycl::get_pointer_type(ptr, (hypre_HandleSyclComputeQueue(hypre_handle()))->get_context()); + + if (allocType == sycl::usm::alloc::unknown) + { + *memory_location = hypre_MEMORY_HOST; + } + else if (allocType == sycl::usm::alloc::host) + { + *memory_location = hypre_MEMORY_HOST_PINNED; + } + else if (allocType == sycl::usm::alloc::device) + { + *memory_location = hypre_MEMORY_DEVICE; + } + else if (allocType == sycl::usm::alloc::shared) + { + *memory_location = hypre_MEMORY_UNIFIED; + } +#endif //HYPRE_USING_SYCL + +#else /* #if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) */ *memory_location = hypre_MEMORY_HOST; #endif From 61d0edbd021044e98f099856db5bb2329e403706 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 27 Jul 2021 20:57:38 +0000 Subject: [PATCH 02/44] Change names and fix initialization This does a bunch of name changing of files, data 
structures, and variables from 'cuda' to 'device' in order to reflect which things are generic device functionality vs. tied to a specific language. In addition, this now compiles and runs a simple program that calls HYPRE_Init() and allocates/copies/frees memory on the device and host with unified memory. --- src/seq_mv/csr_matvec_device.c | 3 +- src/struct_ls/HYPRE_struct_int.c | 4 +- src/utilities/CMakeLists.txt | 4 +- src/utilities/Makefile | 5 +- src/utilities/_hypre_utilities.h | 70 ++--- src/utilities/_hypre_utilities.hpp | 256 +++++++++--------- .../{cuda_reducer.h => device_reducer.h} | 6 +- .../{cuda_utils.c => device_utils.c} | 140 ++++++---- .../{cuda_utils.h => device_utils.h} | 175 ++++++++---- src/utilities/general.c | 101 ++----- src/utilities/handle.h | 68 ++--- src/utilities/headers | 5 +- src/utilities/memory.c | 60 ++-- 13 files changed, 464 insertions(+), 433 deletions(-) rename src/utilities/{cuda_reducer.h => device_reducer.h} (96%) rename src/utilities/{cuda_utils.c => device_utils.c} (90%) rename src/utilities/{cuda_utils.h => device_utils.h} (79%) diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index cd273fd938..4751d7c384 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -51,7 +51,8 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, #elif defined(HYPRE_USING_ROCSPARSE) hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); #else // #ifdef HYPRE_USING_CUSPARSE -#error HYPRE SPMV TODO +// WM: TODO: commenting this out for now, but put it back after sycl impelentation is done +/* #error HYPRE SPMV TODO */ #endif return hypre_error_flag; diff --git a/src/struct_ls/HYPRE_struct_int.c b/src/struct_ls/HYPRE_struct_int.c index e9048acbf7..abb1869fcd 100644 --- a/src/struct_ls/HYPRE_struct_int.c +++ b/src/struct_ls/HYPRE_struct_int.c @@ -71,7 +71,9 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector, hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), 
loop_size, v_data_box, start, unit_stride, vi); { -#if defined(HYPRE_USING_GPU) +// WM: TODO: temporary fix... remove after sycl implementation is done +#if defined(HYPRE_USING_SYCL) +#elif defined(HYPRE_USING_GPU) vp[vi] = rand_device[idx]; #else vp[vi] = 2.0*hypre_Rand() - 1.0; diff --git a/src/utilities/CMakeLists.txt b/src/utilities/CMakeLists.txt index 33c00cc53a..cd51132a21 100644 --- a/src/utilities/CMakeLists.txt +++ b/src/utilities/CMakeLists.txt @@ -22,7 +22,7 @@ set(SRCS fortran_matrix.c ap.c complex.c - cuda_utils.c + device_utils.c error.c general.c handle.c @@ -52,7 +52,7 @@ target_sources(${PROJECT_NAME} if (HYPRE_USING_CUDA) set(CUDA_SRCS HYPRE_handle.c - cuda_utils.c + device_utils.c handle.c general.c memory.c diff --git a/src/utilities/Makefile b/src/utilities/Makefile index 4b37e1cd3e..07581dd7d4 100644 --- a/src/utilities/Makefile +++ b/src/utilities/Makefile @@ -58,13 +58,12 @@ FILES =\ timing.c CUFILES=\ - cuda_utils.c\ + device_utils.c\ general.c\ handle.c\ memory.c\ omp_device.c\ - nvtx.c\ - sycl_utils.c + nvtx.c COBJS = ${FILES:.c=.o} CUOBJS = ${CUFILES:.c=.obj} diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 503c13f2d3..b2da3c2bca 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1206,11 +1206,8 @@ static char hypre__levelname[16]; #ifndef HYPRE_HANDLE_H #define HYPRE_HANDLE_H -struct hypre_CudaData; -typedef struct hypre_CudaData hypre_CudaData; - -struct hypre_SyclData; -typedef struct hypre_SyclData hypre_SyclData; +struct hypre_DeviceData; +typedef struct hypre_DeviceData hypre_DeviceData; typedef struct { @@ -1219,7 +1216,7 @@ typedef struct HYPRE_ExecutionPolicy default_exec_policy; HYPRE_ExecutionPolicy struct_exec_policy; #if defined(HYPRE_USING_GPU) - hypre_CudaData *cuda_data; + hypre_DeviceData *device_data; #endif #if defined(HYPRE_USING_UMPIRE) char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; @@ -1237,43 +1234,39 @@ typedef struct 
HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif -#if defined(HYPRE_USING_SYCL) - hypre_SyclData *sycl_data; -#endif } hypre_Handle; /* accessor macros to hypre_Handle */ #define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) -#define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) -#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) - -#define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCusparseHandle(hypre_handle) hypre_CudaDataCusparseHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStream(hypre_handle) hypre_CudaDataCudaComputeStream(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubBinGrowth(hypre_handle) hypre_CudaDataCubBinGrowth(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMinBin(hypre_handle) hypre_CudaDataCubMinBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxBin(hypre_handle) hypre_CudaDataCubMaxBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_CudaDataCubMaxCachedBytes(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubDevAllocator(hypre_handle) hypre_CudaDataCubDevAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_CudaDataCubUvmAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaDevice(hypre_handle) hypre_CudaDataCudaDevice(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStreamNum(hypre_handle) hypre_CudaDataCudaComputeStreamNum(hypre_HandleCudaData(hypre_handle)) -#define 
hypre_HandleCudaReduceBuffer(hypre_handle) hypre_CudaDataCudaReduceBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_CudaDataStructCommRecvBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_CudaDataStructCommSendBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_CudaDataStructCommRecvBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_CudaDataStructCommSendBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_CudaDataSpgemmUseCusparse(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_CudaDataSpgemmNumPasses(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMethod(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateNsamples(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMultFactor(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmHashType(hypre_handle) hypre_CudaDataSpgemmHashType(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_CudaDataUmpireDeviceAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUseGpuRand(hypre_handle) hypre_CudaDataUseGpuRand(hypre_HandleCudaData(hypre_handle)) +#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) + +#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) +#define 
hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_DeviceDataStructCommSendBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmNumPasses(hypre_handle) 
hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmHashType(hypre_handle) hypre_DeviceDataSpgemmHashType(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_DeviceDataUmpireDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) #define hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) @@ -1290,9 +1283,6 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) -#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 4d4eed5220..c95eba5773 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ 
-75,6 +75,10 @@ struct hypre_umpire_device_allocator #if defined(HYPRE_USING_GPU) +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * cuda includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #include #include @@ -100,16 +104,45 @@ struct hypre_umpire_device_allocator #define CUSPARSE_NEWAPI_VERSION 11000 +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * hip includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #elif defined(HYPRE_USING_HIP) #include +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * sycl includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#elif defined(HYPRE_USING_SYCL) + +#include +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include */ +/* #include */ + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #if defined(HYPRE_USING_ROCSPARSE) #include #endif +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macros for wrapping cuda/hip/sycl calls for error reporting + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUDA_CALL(call) do { \ @@ -129,6 +162,25 @@ struct hypre_umpire_device_allocator hypre_assert(0); exit(1); \ } } while(0) +#elif defined(HYPRE_USING_SYCL) +#define HYPRE_SYCL_CALL(call) \ + try \ + { \ + call; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) \ + { \ + hypre_printf("STD ERROR (code = %s) at 
%s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -163,11 +215,12 @@ struct hypre_umpire_device_allocator hypre_assert(0); exit(1); \ } } while(0) -struct hypre_cub_CachingDeviceAllocator; -typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device defined values + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 #elif defined(HYPRE_USING_HIP) @@ -181,7 +234,14 @@ typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator #define HYPRE_1D_BLOCK_SIZE 512 #define HYPRE_MAX_NUM_STREAMS 10 -struct hypre_CudaData +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device info data structures + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +struct hypre_cub_CachingDeviceAllocator; +typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; + +struct hypre_DeviceData { #if defined(HYPRE_USING_CURAND) curandGenerator_t curand_generator; @@ -200,9 +260,11 @@ struct hypre_CudaData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) - cudaStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + cudaStream_t streams[HYPRE_MAX_NUM_STREAMS]; #elif defined(HYPRE_USING_HIP) - hipStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + hipStream_t streams[HYPRE_MAX_NUM_STREAMS]; +#elif defined(HYPRE_USING_SYCL) + sycl::queue* streams[HYPRE_MAX_NUM_STREAMS] = {NULL}; #endif #ifdef HYPRE_USING_DEVICE_POOL @@ -216,12 +278,16 
@@ struct hypre_CudaData #ifdef HYPRE_USING_UMPIRE_DEVICE hypre_umpire_device_allocator umpire_device_allocator; #endif - HYPRE_Int cuda_device; +#if defined(HYPRE_USING_SYCL) + sycl::device device; +#else + HYPRE_Int device; +#endif /* by default, hypre puts GPU computations in this stream - * Do not be confused with the default (null) CUDA stream */ - HYPRE_Int cuda_compute_stream_num; - /* work space for hypre's CUDA reducer */ - void *cuda_reduce_buffer; + * Do not be confused with the default (null) stream */ + HYPRE_Int compute_stream_num; + /* work space for hypre's device reducer */ + void *reduce_buffer; /* the device buffers needed to do MPI communication for struct comm */ HYPRE_Complex* struct_comm_recv_buffer; HYPRE_Complex* struct_comm_send_buffer; @@ -238,53 +304,56 @@ struct hypre_CudaData HYPRE_Int use_gpu_rand; }; -#define hypre_CudaDataCubBinGrowth(data) ((data) -> cub_bin_growth) -#define hypre_CudaDataCubMinBin(data) ((data) -> cub_min_bin) -#define hypre_CudaDataCubMaxBin(data) ((data) -> cub_max_bin) -#define hypre_CudaDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) -#define hypre_CudaDataCubDevAllocator(data) ((data) -> cub_dev_allocator) -#define hypre_CudaDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) -#define hypre_CudaDataCudaDevice(data) ((data) -> cuda_device) -#define hypre_CudaDataCudaComputeStreamNum(data) ((data) -> cuda_compute_stream_num) -#define hypre_CudaDataCudaReduceBuffer(data) ((data) -> cuda_reduce_buffer) -#define hypre_CudaDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) -#define hypre_CudaDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) -#define hypre_CudaDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) -#define hypre_CudaDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) -#define hypre_CudaDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_CudaDataSpgemmNumPasses(data) ((data) -> 
spgemm_num_passes) -#define hypre_CudaDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) -#define hypre_CudaDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) -#define hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) -#define hypre_CudaDataSpgemmHashType(data) ((data) -> spgemm_hash_type) -#define hypre_CudaDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) -#define hypre_CudaDataUseGpuRand(data) ((data) -> use_gpu_rand) - -hypre_CudaData* hypre_CudaDataCreate(); -void hypre_CudaDataDestroy(hypre_CudaData* data); +#define hypre_DeviceDataCubBinGrowth(data) ((data) -> cub_bin_growth) +#define hypre_DeviceDataCubMinBin(data) ((data) -> cub_min_bin) +#define hypre_DeviceDataCubMaxBin(data) ((data) -> cub_max_bin) +#define hypre_DeviceDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) +#define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) +#define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) +#define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) +#define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) +#define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) +#define hypre_DeviceDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) +#define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) +#define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) +#define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) +#define hypre_DeviceDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) +#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> 
spgemm_rownnz_estimate_nsamples) +#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) +#define hypre_DeviceDataSpgemmHashType(data) ((data) -> spgemm_hash_type) +#define hypre_DeviceDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) +#define hypre_DeviceDataUseGpuRand(data) ((data) -> use_gpu_rand) + +hypre_DeviceData* hypre_DeviceDataCreate(); +void hypre_DeviceDataDestroy(hypre_DeviceData* data); #if defined(HYPRE_USING_CURAND) -curandGenerator_t hypre_CudaDataCurandGenerator(hypre_CudaData *data); +curandGenerator_t hypre_DeviceDataCurandGenerator(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUBLAS) -cublasHandle_t hypre_CudaDataCublasHandle(hypre_CudaData *data); +cublasHandle_t hypre_DeviceDataCublasHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUSPARSE) -cusparseHandle_t hypre_CudaDataCusparseHandle(hypre_CudaData *data); +cusparseHandle_t hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_ROCSPARSE) -rocsparse_handle hypre_CudaDataCusparseHandle(hypre_CudaData *data); +rocsparse_handle hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -cudaStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -cudaStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +cudaStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +cudaStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); #elif defined(HYPRE_USING_HIP) -hipStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -hipStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +hipStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +hipStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); +#elif defined(HYPRE_USING_SYCL) +sycl::queue* hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); 
+sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data); #endif // Data structure and accessor routines for Cuda Sparse Triangular Matrices @@ -368,7 +437,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_CUDA_CALL( cudaGetLastError() ); \ @@ -385,7 +454,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_HIP_CALL( hipGetLastError() ); \ @@ -405,7 +474,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ } @@ -418,26 +487,26 @@ using namespace thrust::placeholders; #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #elif HYPRE_USING_DEVICE_POOL #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif #else #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #endif // HYPRE_USING_UMPIRE_DEVICE @@ -972,9 +1041,9 @@ cudaError_t hypre_CachingFreeDevice(void *ptr); cudaError_t hypre_CachingFreeManaged(void *ptr); #endif -hypre_cub_CachingDeviceAllocator * hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); +hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); -void hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data); +void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1200,13 +1269,13 @@ struct ReduceSum __thread_sum = 0.0; nblocks = -1; - if (hypre_HandleCudaReduceBuffer(hypre_handle()) == NULL) + if (hypre_HandleReduceBuffer(hypre_handle()) == NULL) { /* allocate for the max size for reducing double6 type */ - hypre_HandleCudaReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); + hypre_HandleReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); } - d_buf = (T*) hypre_HandleCudaReduceBuffer(hypre_handle()); + d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle()); } /* copy constructor */ @@ -2111,81 +2180,6 @@ struct hypre_cub_CachingDeviceAllocator #endif // #if defined(HYPRE_USING_CUDA) && defined(HYPRE_USING_DEVICE_POOL) #endif // #ifndef HYPRE_CUB_ALLOCATOR_HEADER -/****************************************************************************** - * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other - * HYPRE 
Project Developers. See the top-level COPYRIGHT file for details. - * - * SPDX-License-Identifier: (Apache-2.0 OR MIT) - ******************************************************************************/ - -#ifndef HYPRE_SYCL_UTILS_HPP -#define HYPRE_SYCL_UTILS_HPP - -#if defined(HYPRE_USING_SYCL) - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -#include -/* #include */ -/* #include */ - -#define HYPRE_SYCL_CALL( EXPR ) \ - try \ - { \ - EXPR; \ - } \ - catch (sycl::exception const &ex) \ - { \ - hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ - __FILE__, __LINE__); \ - assert(0); exit(1); \ - } \ - catch(std::runtime_error const& ex) { \ - hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ - __FILE__, __LINE__); \ - assert(0); exit(1); \ - } - -// HYPRE_SUBGROUP_BITSHIFT is just log2 of HYPRE_SUBGROUP_SIZE -#define HYPRE_SUBGROUP_SIZE 32 -#define HYPRE_SUBGROUP_BITSHIFT 5 -#define HYPRE_MAX_NUM_SUBGROUPS (64 * 64 * 32) -#define HYPRE_FLT_LARGE 1e30 -#define HYPRE_1D_BLOCK_SIZE 512 -#define HYPRE_MAX_NUM_QUEUES 10 - -struct hypre_SyclData -{ - sycl::queue* sycl_queues[HYPRE_MAX_NUM_QUEUES] = {}; - sycl::device sycl_device; - - /* by default, hypre puts GPU computations in this queue - * Do not be confused with the default (null) SYCL queue */ - HYPRE_Int sycl_compute_queue_num; -}; - -#define hypre_SyclDataSyclDevice(data) ((data) -> sycl_device) -#define hypre_SyclDataSyclComputeQueueNum(data) ((data) -> sycl_compute_queue_num) - -hypre_SyclData* hypre_SyclDataCreate(); -void hypre_SyclDataDestroy(hypre_SyclData* data); - -sycl::queue *hypre_SyclDataSyclQueue(hypre_SyclData *data, HYPRE_Int i); -sycl::queue *hypre_SyclDataSyclComputeQueue(hypre_SyclData *data); - -#endif // #if defined(HYPRE_USING_SYCL) - -#endif /* #ifndef HYPRE_SYCL_UTILS_HPP */ #ifdef __cplusplus } diff --git 
a/src/utilities/cuda_reducer.h b/src/utilities/device_reducer.h similarity index 96% rename from src/utilities/cuda_reducer.h rename to src/utilities/device_reducer.h index d489bb589e..729bbce535 100644 --- a/src/utilities/cuda_reducer.h +++ b/src/utilities/device_reducer.h @@ -211,13 +211,13 @@ struct ReduceSum __thread_sum = 0.0; nblocks = -1; - if (hypre_HandleCudaReduceBuffer(hypre_handle()) == NULL) + if (hypre_HandleReduceBuffer(hypre_handle()) == NULL) { /* allocate for the max size for reducing double6 type */ - hypre_HandleCudaReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); + hypre_HandleReduceBuffer(hypre_handle()) = hypre_TAlloc(HYPRE_double6, 1024, HYPRE_MEMORY_DEVICE); } - d_buf = (T*) hypre_HandleCudaReduceBuffer(hypre_handle()); + d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle()); } /* copy constructor */ diff --git a/src/utilities/cuda_utils.c b/src/utilities/device_utils.c similarity index 90% rename from src/utilities/cuda_utils.c rename to src/utilities/device_utils.c index 4fd055d90e..f9043d0e35 100644 --- a/src/utilities/cuda_utils.c +++ b/src/utilities/device_utils.c @@ -35,7 +35,7 @@ void hypre_CudaCompileFlagCheck() // This is really only defined for CUDA and not for HIP #if defined(HYPRE_USING_CUDA) - HYPRE_Int device = hypre_HandleCudaDevice(hypre_handle()); + HYPRE_Int device = hypre_HandleDevice(hypre_handle()); struct cudaDeviceProp props; cudaGetDeviceProperties(&props, device); @@ -852,8 +852,10 @@ cudaStream_t cudaStream_t #elif defined(HYPRE_USING_HIP) hipStream_t +#elif defined(HYPRE_USING_SYCL) +sycl::queue* #endif -hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) +hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) { #if defined(HYPRE_USING_DEVICE_OPENMP) cudaStream_t stream = 0; @@ -861,6 +863,41 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) cudaStream_t stream = 0; #elif defined(HYPRE_USING_HIP) hipStream_t stream = 0; +#elif 
defined(HYPRE_USING_SYCL) + sycl::queue *stream = NULL; + if (i >= HYPRE_MAX_NUM_STREAMS) + { + hypre_printf("SYCL queue %d exceeds the max number %d\n", + i, HYPRE_MAX_NUM_STREAMS); + return NULL; + } + if (data->streams[i]) + { + return data->streams[i]; + } + else + { + auto sycl_asynchandler = [] (sycl::exception_list exceptions) + { + for (std::exception_ptr const& e : exceptions) + { + try + { + std::rethrow_exception(e); + } + catch (sycl::exception const& ex) + { + std::cout << "Caught asynchronous SYCL exception:" << std::endl + << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + } + } + }; + + sycl::device syclDev = data->device; + sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + data->streams[i] = stream; + } #endif #if defined(HYPRE_USING_CUDA_STREAMS) @@ -874,9 +911,9 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) return NULL; } - if (data->cuda_streams[i]) + if (data->streams[i]) { - return data->cuda_streams[i]; + return data->streams[i]; } #if defined(HYPRE_USING_DEVICE_OPENMP) @@ -888,7 +925,7 @@ hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i) HYPRE_HIP_CALL(hipStreamCreateWithFlags(&stream, hipStreamDefault)); #endif - data->cuda_streams[i] = stream; + data->streams[i] = stream; #endif return stream; @@ -900,16 +937,18 @@ cudaStream_t cudaStream_t #elif defined(HYPRE_USING_HIP) hipStream_t +#elif defined(HYPRE_USING_SYCL) +sycl::queue* #endif -hypre_CudaDataCudaComputeStream(hypre_CudaData *data) +hypre_DeviceDataComputeStream(hypre_DeviceData *data) { - return hypre_CudaDataCudaStream(data, - hypre_CudaDataCudaComputeStreamNum(data)); + return hypre_DeviceDataStream(data, + hypre_DeviceDataComputeStreamNum(data)); } #if defined(HYPRE_USING_CURAND) curandGenerator_t -hypre_CudaDataCurandGenerator(hypre_CudaData *data) +hypre_DeviceDataCurandGenerator(hypre_DeviceData *data) { if 
(data->curand_generator) { @@ -919,7 +958,7 @@ hypre_CudaDataCurandGenerator(hypre_CudaData *data) curandGenerator_t gen; HYPRE_CURAND_CALL( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT) ); HYPRE_CURAND_CALL( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL) ); - HYPRE_CURAND_CALL( curandSetStream(gen, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CURAND_CALL( curandSetStream(gen, hypre_DeviceDataComputeStream(data)) ); data->curand_generator = gen; @@ -1005,7 +1044,7 @@ hypre_CurandUniformSingle( HYPRE_Int n, #if defined(HYPRE_USING_CUBLAS) cublasHandle_t -hypre_CudaDataCublasHandle(hypre_CudaData *data) +hypre_DeviceDataCublasHandle(hypre_DeviceData *data) { if (data->cublas_handle) { @@ -1015,7 +1054,7 @@ hypre_CudaDataCublasHandle(hypre_CudaData *data) cublasHandle_t handle; HYPRE_CUBLAS_CALL( cublasCreate(&handle) ); - HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CUBLAS_CALL( cublasSetStream(handle, hypre_DeviceDataComputeStream(data)) ); data->cublas_handle = handle; @@ -1025,7 +1064,7 @@ hypre_CudaDataCublasHandle(hypre_CudaData *data) #if defined(HYPRE_USING_CUSPARSE) cusparseHandle_t -hypre_CudaDataCusparseHandle(hypre_CudaData *data) +hypre_DeviceDataCusparseHandle(hypre_DeviceData *data) { if (data->cusparse_handle) { @@ -1035,7 +1074,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) cusparseHandle_t handle; HYPRE_CUSPARSE_CALL( cusparseCreate(&handle) ); - HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_CUSPARSE_CALL( cusparseSetStream(handle, hypre_DeviceDataComputeStream(data)) ); data->cusparse_handle = handle; @@ -1046,7 +1085,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) #if defined(HYPRE_USING_ROCSPARSE) rocsparse_handle -hypre_CudaDataCusparseHandle(hypre_CudaData *data) +hypre_DeviceDataCusparseHandle(hypre_DeviceData *data) { if (data->cusparse_handle) { @@ -1056,7 +1095,7 @@ hypre_CudaDataCusparseHandle(hypre_CudaData 
*data) rocsparse_handle handle; HYPRE_ROCSPARSE_CALL( rocsparse_create_handle(&handle) ); - HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_CudaDataCudaComputeStream(data)) ); + HYPRE_ROCSPARSE_CALL( rocsparse_set_stream(handle, hypre_DeviceDataComputeStream(data)) ); data->cusparse_handle = handle; @@ -1066,58 +1105,62 @@ hypre_CudaDataCusparseHandle(hypre_CudaData *data) -hypre_CudaData* -hypre_CudaDataCreate() +hypre_DeviceData* +hypre_DeviceDataCreate() { - hypre_CudaData *data = hypre_CTAlloc(hypre_CudaData, 1, HYPRE_MEMORY_HOST); + hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); - hypre_CudaDataCudaDevice(data) = 0; - hypre_CudaDataCudaComputeStreamNum(data) = 0; +#if defined(HYPRE_USING_SYCL) + hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); +#else + hypre_DeviceDataDevice(data) = 0; +#endif + hypre_DeviceDataComputeStreamNum(data) = 0; /* SpGeMM */ #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) - hypre_CudaDataSpgemmUseCusparse(data) = 1; + hypre_DeviceDataSpgemmUseCusparse(data) = 1; #else - hypre_CudaDataSpgemmUseCusparse(data) = 0; + hypre_DeviceDataSpgemmUseCusparse(data) = 0; #endif - hypre_CudaDataSpgemmNumPasses(data) = 3; + hypre_DeviceDataSpgemmNumPasses(data) = 3; /* 1: naive overestimate, 2: naive underestimate, 3: Cohen's algorithm */ - hypre_CudaDataSpgemmRownnzEstimateMethod(data) = 3; - hypre_CudaDataSpgemmRownnzEstimateNsamples(data) = 32; - hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) = 1.5; - hypre_CudaDataSpgemmHashType(data) = 'L'; + hypre_DeviceDataSpgemmRownnzEstimateMethod(data) = 3; + hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) = 32; + hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) = 1.5; + hypre_DeviceDataSpgemmHashType(data) = 'L'; /* pmis */ #ifdef HYPRE_USING_CURAND - hypre_CudaDataUseGpuRand(data) = 1; + hypre_DeviceDataUseGpuRand(data) = 1; #else - hypre_CudaDataUseGpuRand(data) = 0; + hypre_DeviceDataUseGpuRand(data) = 0; #endif 
/* device pool */ #ifdef HYPRE_USING_DEVICE_POOL - hypre_CudaDataCubBinGrowth(data) = 8u; - hypre_CudaDataCubMinBin(data) = 1u; - hypre_CudaDataCubMaxBin(data) = (hypre_uint) -1; - hypre_CudaDataCubMaxCachedBytes(data) = (size_t) -1; - hypre_CudaDataCubDevAllocator(data) = NULL; - hypre_CudaDataCubUvmAllocator(data) = NULL; + hypre_DeviceDataCubBinGrowth(data) = 8u; + hypre_DeviceDataCubMinBin(data) = 1u; + hypre_DeviceDataCubMaxBin(data) = (hypre_uint) -1; + hypre_DeviceDataCubMaxCachedBytes(data) = (size_t) -1; + hypre_DeviceDataCubDevAllocator(data) = NULL; + hypre_DeviceDataCubUvmAllocator(data) = NULL; #endif return data; } void -hypre_CudaDataDestroy(hypre_CudaData *data) +hypre_DeviceDataDestroy(hypre_DeviceData *data) { if (!data) { return; } - hypre_TFree(hypre_CudaDataCudaReduceBuffer(data), HYPRE_MEMORY_DEVICE); - hypre_TFree(hypre_CudaDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE); - hypre_TFree(hypre_CudaDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataReduceBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataStructCommRecvBuffer(data), HYPRE_MEMORY_DEVICE); + hypre_TFree(hypre_DeviceDataStructCommSendBuffer(data), HYPRE_MEMORY_DEVICE); #if defined(HYPRE_USING_CURAND) if (data->curand_generator) @@ -1146,20 +1189,23 @@ hypre_CudaDataDestroy(hypre_CudaData *data) for (HYPRE_Int i = 0; i < HYPRE_MAX_NUM_STREAMS; i++) { - if (data->cuda_streams[i]) + if (data->streams[i]) { #if defined(HYPRE_USING_DEVICE_OPENMP) - HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) ); + HYPRE_CUDA_CALL( cudaStreamDestroy(data->streams[i]) ); #elif defined(HYPRE_USING_CUDA) - HYPRE_CUDA_CALL( cudaStreamDestroy(data->cuda_streams[i]) ); + HYPRE_CUDA_CALL( cudaStreamDestroy(data->streams[i]) ); #elif defined(HYPRE_USING_HIP) - HYPRE_HIP_CALL( hipStreamDestroy(data->cuda_streams[i]) ); + HYPRE_HIP_CALL( hipStreamDestroy(data->streams[i]) ); +#elif defined(HYPRE_USING_SYCL) + delete data->streams[i]; + 
data->streams[i] = nullptr; #endif } } #ifdef HYPRE_USING_DEVICE_POOL - hypre_CudaDataCubCachingAllocatorDestroy(data); + hypre_DeviceDataCubCachingAllocatorDestroy(data); #endif hypre_TFree(data, HYPRE_MEMORY_HOST); @@ -1222,9 +1268,9 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action, if (cuda_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) - HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) ); + HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) - HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleCudaComputeStream(hypre_handle)) ); + HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #endif } #endif diff --git a/src/utilities/cuda_utils.h b/src/utilities/device_utils.h similarity index 79% rename from src/utilities/cuda_utils.h rename to src/utilities/device_utils.h index 4394a892aa..7d4030cd5e 100644 --- a/src/utilities/cuda_utils.h +++ b/src/utilities/device_utils.h @@ -10,6 +10,10 @@ #if defined(HYPRE_USING_GPU) +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * cuda includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #include #include @@ -35,16 +39,45 @@ #define CUSPARSE_NEWAPI_VERSION 11000 +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * hip includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + #elif defined(HYPRE_USING_HIP) #include +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * sycl includes + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +#elif defined(HYPRE_USING_SYCL) + +#include +/* #include */ +/* #include */ +/* #include */ +/* #include */ + +/* #include // dpct::remove_if, remove_copy_if, copy_if */ + +/* #include */ +/* #include */ +/* #include */ +/* 
#include */ + +/* #include */ +/* #include */ + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #if defined(HYPRE_USING_ROCSPARSE) #include #endif +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macros for wrapping cuda/hip/sycl calls for error reporting + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUDA_CALL(call) do { \ @@ -64,6 +97,25 @@ hypre_assert(0); exit(1); \ } } while(0) +#elif defined(HYPRE_USING_SYCL) +#define HYPRE_SYCL_CALL(call) \ + try \ + { \ + call; \ + } \ + catch (sycl::exception const &ex) \ + { \ + hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } \ + catch(std::runtime_error const& ex) \ + { \ + hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(), \ + __FILE__, __LINE__); \ + assert(0); exit(1); \ + } + #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) #define HYPRE_CUBLAS_CALL(call) do { \ @@ -98,11 +150,12 @@ hypre_assert(0); exit(1); \ } } while(0) -struct hypre_cub_CachingDeviceAllocator; -typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * device defined values + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ // HYPRE_WARP_BITSHIFT is just log2 of HYPRE_WARP_SIZE -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_SYCL) #define HYPRE_WARP_SIZE 32 #define HYPRE_WARP_BITSHIFT 5 #elif defined(HYPRE_USING_HIP) @@ -116,7 +169,14 @@ typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator #define HYPRE_1D_BLOCK_SIZE 512 #define HYPRE_MAX_NUM_STREAMS 10 -struct hypre_CudaData +/* - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - + * device info data structures + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ + +struct hypre_cub_CachingDeviceAllocator; +typedef struct hypre_cub_CachingDeviceAllocator hypre_cub_CachingDeviceAllocator; + +struct hypre_DeviceData { #if defined(HYPRE_USING_CURAND) curandGenerator_t curand_generator; @@ -135,9 +195,11 @@ struct hypre_CudaData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) - cudaStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + cudaStream_t streams[HYPRE_MAX_NUM_STREAMS]; #elif defined(HYPRE_USING_HIP) - hipStream_t cuda_streams[HYPRE_MAX_NUM_STREAMS]; + hipStream_t streams[HYPRE_MAX_NUM_STREAMS]; +#elif defined(HYPRE_USING_SYCL) + sycl::queue* streams[HYPRE_MAX_NUM_STREAMS] = {NULL}; #endif #ifdef HYPRE_USING_DEVICE_POOL @@ -151,12 +213,16 @@ struct hypre_CudaData #ifdef HYPRE_USING_UMPIRE_DEVICE hypre_umpire_device_allocator umpire_device_allocator; #endif - HYPRE_Int cuda_device; +#if defined(HYPRE_USING_SYCL) + sycl::device device; +#else + HYPRE_Int device; +#endif /* by default, hypre puts GPU computations in this stream - * Do not be confused with the default (null) CUDA stream */ - HYPRE_Int cuda_compute_stream_num; - /* work space for hypre's CUDA reducer */ - void *cuda_reduce_buffer; + * Do not be confused with the default (null) stream */ + HYPRE_Int compute_stream_num; + /* work space for hypre's device reducer */ + void *reduce_buffer; /* the device buffers needed to do MPI communication for struct comm */ HYPRE_Complex* struct_comm_recv_buffer; HYPRE_Complex* struct_comm_send_buffer; @@ -173,53 +239,56 @@ struct hypre_CudaData HYPRE_Int use_gpu_rand; }; -#define hypre_CudaDataCubBinGrowth(data) ((data) -> cub_bin_growth) -#define hypre_CudaDataCubMinBin(data) ((data) -> cub_min_bin) -#define hypre_CudaDataCubMaxBin(data) ((data) -> cub_max_bin) -#define hypre_CudaDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) -#define 
hypre_CudaDataCubDevAllocator(data) ((data) -> cub_dev_allocator) -#define hypre_CudaDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) -#define hypre_CudaDataCudaDevice(data) ((data) -> cuda_device) -#define hypre_CudaDataCudaComputeStreamNum(data) ((data) -> cuda_compute_stream_num) -#define hypre_CudaDataCudaReduceBuffer(data) ((data) -> cuda_reduce_buffer) -#define hypre_CudaDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) -#define hypre_CudaDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) -#define hypre_CudaDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) -#define hypre_CudaDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) -#define hypre_CudaDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_CudaDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) -#define hypre_CudaDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) -#define hypre_CudaDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) -#define hypre_CudaDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) -#define hypre_CudaDataSpgemmHashType(data) ((data) -> spgemm_hash_type) -#define hypre_CudaDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) -#define hypre_CudaDataUseGpuRand(data) ((data) -> use_gpu_rand) - -hypre_CudaData* hypre_CudaDataCreate(); -void hypre_CudaDataDestroy(hypre_CudaData* data); +#define hypre_DeviceDataCubBinGrowth(data) ((data) -> cub_bin_growth) +#define hypre_DeviceDataCubMinBin(data) ((data) -> cub_min_bin) +#define hypre_DeviceDataCubMaxBin(data) ((data) -> cub_max_bin) +#define hypre_DeviceDataCubMaxCachedBytes(data) ((data) -> cub_max_cached_bytes) +#define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) +#define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) +#define hypre_DeviceDataDevice(data) ((data) -> 
device) +#define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) +#define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) +#define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) +#define hypre_DeviceDataStructCommSendBuffer(data) ((data) -> struct_comm_send_buffer) +#define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) +#define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) +#define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) +#define hypre_DeviceDataSpgemmNumPasses(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) +#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) +#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) +#define hypre_DeviceDataSpgemmHashType(data) ((data) -> spgemm_hash_type) +#define hypre_DeviceDataUmpireDeviceAllocator(data) ((data) -> umpire_device_allocator) +#define hypre_DeviceDataUseGpuRand(data) ((data) -> use_gpu_rand) + +hypre_DeviceData* hypre_DeviceDataCreate(); +void hypre_DeviceDataDestroy(hypre_DeviceData* data); #if defined(HYPRE_USING_CURAND) -curandGenerator_t hypre_CudaDataCurandGenerator(hypre_CudaData *data); +curandGenerator_t hypre_DeviceDataCurandGenerator(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUBLAS) -cublasHandle_t hypre_CudaDataCublasHandle(hypre_CudaData *data); +cublasHandle_t hypre_DeviceDataCublasHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUSPARSE) -cusparseHandle_t hypre_CudaDataCusparseHandle(hypre_CudaData *data); +cusparseHandle_t hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_ROCSPARSE) -rocsparse_handle hypre_CudaDataCusparseHandle(hypre_CudaData *data); 
+rocsparse_handle hypre_DeviceDataCusparseHandle(hypre_DeviceData *data); #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) -cudaStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -cudaStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +cudaStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +cudaStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); #elif defined(HYPRE_USING_HIP) -hipStream_t hypre_CudaDataCudaStream(hypre_CudaData *data, HYPRE_Int i); -hipStream_t hypre_CudaDataCudaComputeStream(hypre_CudaData *data); +hipStream_t hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +hipStream_t hypre_DeviceDataComputeStream(hypre_DeviceData *data); +#elif defined(HYPRE_USING_SYCL) +sycl::queue* hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i); +sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data); #endif // Data structure and accessor routines for Cuda Sparse Triangular Matrices @@ -303,7 +372,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_CUDA_CALL( cudaGetLastError() ); \ @@ -320,7 +389,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ hypre_SyncCudaComputeStream(hypre_handle()); \ HYPRE_HIP_CALL( hipGetLastError() ); \ @@ -340,7 +409,7 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), 0, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< 
(gridsize), (blocksize), 0, hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ } \ } @@ -353,26 +422,26 @@ using namespace thrust::placeholders; #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleUmpireDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #elif HYPRE_USING_DEVICE_POOL #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(*(hypre_HandleCubDevAllocator(hypre_handle()))).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif #else #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par.on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par.on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); #endif // HYPRE_USING_CUDA #endif // HYPRE_USING_UMPIRE_DEVICE @@ -907,9 +976,9 @@ cudaError_t hypre_CachingFreeDevice(void *ptr); cudaError_t hypre_CachingFreeManaged(void *ptr); #endif -hypre_cub_CachingDeviceAllocator * hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); +hypre_cub_CachingDeviceAllocator * hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, bool skip_cleanup, bool debug, bool use_managed_memory); -void hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data); +void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) diff --git a/src/utilities/general.c b/src/utilities/general.c index 11d747afad..9bafe21224 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -51,13 +51,14 @@ hypre_HandleCreate() #if defined(HYPRE_USING_GPU) hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_DEVICE; hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_DEVICE; - hypre_HandleCudaData(hypre_handle_) = hypre_CudaDataCreate(); + hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif +// WM: temporarily set the default exec policy to host for sycl until more functionality is available #if defined(HYPRE_USING_SYCL) hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleSyclData(hypre_handle_) = hypre_SyclDataCreate(); + hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif return hypre_handle_; @@ -72,11 +73,7 @@ 
hypre_HandleDestroy(hypre_Handle *hypre_handle_) } #if defined(HYPRE_USING_GPU) - hypre_CudaDataDestroy(hypre_HandleCudaData(hypre_handle_)); -#endif - -#if defined(HYPRE_USING_SYCL) - hypre_SyclDataDestroy(hypre_HandleSyclData(hypre_handle_)); + hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); @@ -101,78 +98,19 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) HYPRE_HIP_CALL( hipSetDevice(device_id) ); #endif -#if defined(HYPRE_USING_GPU) - if (hypre_handle_) - { - hypre_HandleCudaDevice(hypre_handle_) = device_id; - } -#endif - #if defined(HYPRE_USING_SYCL) - // WM: TODO - this ain't it... - hypre_int nDevices=0; - sycl::platform platform(sycl::gpu_selector{}); - auto const& gpu_devices = platform.get_devices(); - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - if(gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - nDevices += subDevicesDomainNuma.size(); - } - else - { - nDevices++; - } - } - } - - if (device_id > nDevices) + /* sycl device set at construction of hypre_DeviceData object */ +#elif defined(HYPRE_USING_GPU) + if (hypre_handle_) { - // WM: debug - hypre_printf("device_id = %d, nDevices = %d\n", device_id, nDevices); - hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); - } - - HYPRE_Int local_nDevices=0; - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - // multi-tile GPUs - if (gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - for (const auto &tile : subDevicesDomainNuma) - { - if (local_nDevices == device_id) - { - hypre_HandleSyclDevice(hypre_handle_) = tile; - } - local_nDevices++; - } - } - // single-tile GPUs - else - { - if (local_nDevices == device_id) - { - hypre_HandleSyclDevice(hypre_handle_) = gpu_devices[i]; - } - local_nDevices++; - } - } + hypre_HandleDevice(hypre_handle_) = device_id; } #endif return hypre_error_flag; } -/* Note: it doesn't return device_id in hypre_Handle->hypre_CudaData, +/* Note: it doesn't return device_id in hypre_Handle->hypre_DeviceData, * calls API instead. But these two should match at all times */ HYPRE_Int @@ -191,7 +129,7 @@ hypre_GetDevice(hypre_int *device_id) #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO + /* sycl device set at construction of hypre_DeviceData object */ #endif return hypre_error_flag; @@ -253,7 +191,15 @@ hypre_GetDeviceLastError() #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO + try + { + hypre_HandleComputeStream(hypre_handle())->wait_and_throw(); + } + catch (sycl::exception const& e) + { + std::cout << "Caught synchronous SYCL exception:\n" + << e.what() << std::endl; + } #endif return hypre_error_flag; @@ -280,7 +226,7 @@ HYPRE_Init() _hypre_handle = hypre_HandleCreate(); } -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) hypre_GetDeviceLastError(); /* Notice: the cudaStream created is specific to the device @@ -293,12 +239,7 @@ HYPRE_Init() /* To include the cost of creating streams/cudahandles in HYPRE_Init */ /* If not here, will be done at the first use */ -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_HandleCudaComputeStream(_hypre_handle); -#endif 
-#if defined(HYPRE_USING_SYCL) - hypre_HandleSyclComputeQueue(_hypre_handle); -#endif + hypre_HandleComputeStream(_hypre_handle); /* A separate stream for prefetching */ //hypre_HandleCudaPrefetchStream(_hypre_handle); diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 2c2bccfcc8..3e4915fc01 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -14,11 +14,8 @@ #ifndef HYPRE_HANDLE_H #define HYPRE_HANDLE_H -struct hypre_CudaData; -typedef struct hypre_CudaData hypre_CudaData; - -struct hypre_SyclData; -typedef struct hypre_SyclData hypre_SyclData; +struct hypre_DeviceData; +typedef struct hypre_DeviceData hypre_DeviceData; typedef struct { @@ -27,7 +24,7 @@ typedef struct HYPRE_ExecutionPolicy default_exec_policy; HYPRE_ExecutionPolicy struct_exec_policy; #if defined(HYPRE_USING_GPU) - hypre_CudaData *cuda_data; + hypre_DeviceData *device_data; #endif #if defined(HYPRE_USING_UMPIRE) char umpire_device_pool_name[HYPRE_UMPIRE_POOL_NAME_MAX_LEN]; @@ -45,43 +42,39 @@ typedef struct HYPRE_Int own_umpire_pinned_pool; umpire_resourcemanager umpire_rm; #endif -#if defined(HYPRE_USING_SYCL) - hypre_SyclData *sycl_data; -#endif } hypre_Handle; /* accessor macros to hypre_Handle */ #define hypre_HandleMemoryLocation(hypre_handle) ((hypre_handle) -> memory_location) #define hypre_HandleDefaultExecPolicy(hypre_handle) ((hypre_handle) -> default_exec_policy) #define hypre_HandleStructExecPolicy(hypre_handle) ((hypre_handle) -> struct_exec_policy) -#define hypre_HandleCudaData(hypre_handle) ((hypre_handle) -> cuda_data) -#define hypre_HandleSyclData(hypre_handle) ((hypre_handle) -> sycl_data) +#define hypre_HandleDeviceData(hypre_handle) ((hypre_handle) -> device_data) -#define hypre_HandleCurandGenerator(hypre_handle) hypre_CudaDataCurandGenerator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCublasHandle(hypre_handle) hypre_CudaDataCublasHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCusparseHandle(hypre_handle) 
hypre_CudaDataCusparseHandle(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStream(hypre_handle) hypre_CudaDataCudaComputeStream(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubBinGrowth(hypre_handle) hypre_CudaDataCubBinGrowth(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMinBin(hypre_handle) hypre_CudaDataCubMinBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxBin(hypre_handle) hypre_CudaDataCubMaxBin(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_CudaDataCubMaxCachedBytes(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubDevAllocator(hypre_handle) hypre_CudaDataCubDevAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_CudaDataCubUvmAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaDevice(hypre_handle) hypre_CudaDataCudaDevice(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaComputeStreamNum(hypre_handle) hypre_CudaDataCudaComputeStreamNum(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleCudaReduceBuffer(hypre_handle) hypre_CudaDataCudaReduceBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_CudaDataStructCommRecvBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_CudaDataStructCommSendBuffer(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_CudaDataStructCommRecvBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_CudaDataStructCommSendBufferSize(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_CudaDataSpgemmUseCusparse(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_CudaDataSpgemmNumPasses(hypre_HandleCudaData(hypre_handle)) -#define 
hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMethod(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateNsamples(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_CudaDataSpgemmRownnzEstimateMultFactor(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleSpgemmHashType(hypre_handle) hypre_CudaDataSpgemmHashType(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_CudaDataUmpireDeviceAllocator(hypre_HandleCudaData(hypre_handle)) -#define hypre_HandleUseGpuRand(hypre_handle) hypre_CudaDataUseGpuRand(hypre_HandleCudaData(hypre_handle)) +#define hypre_HandleCurandGenerator(hypre_handle) hypre_DeviceDataCurandGenerator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCublasHandle(hypre_handle) hypre_DeviceDataCublasHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCusparseHandle(hypre_handle) hypre_DeviceDataCusparseHandle(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStream(hypre_handle) hypre_DeviceDataComputeStream(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubBinGrowth(hypre_handle) hypre_DeviceDataCubBinGrowth(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMinBin(hypre_handle) hypre_DeviceDataCubMinBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxBin(hypre_handle) hypre_DeviceDataCubMaxBin(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubMaxCachedBytes(hypre_handle) hypre_DeviceDataCubMaxCachedBytes(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDevice(hypre_handle) 
hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBuffer(hypre_handle) hypre_DeviceDataStructCommSendBuffer(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmHashType(hypre_handle) hypre_DeviceDataSpgemmHashType(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUmpireDeviceAllocator(hypre_handle) hypre_DeviceDataUmpireDeviceAllocator(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleUseGpuRand(hypre_handle) hypre_DeviceDataUseGpuRand(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleUmpireResourceMan(hypre_handle) ((hypre_handle) -> umpire_rm) #define 
hypre_HandleUmpireDevicePoolSize(hypre_handle) ((hypre_handle) -> umpire_device_pool_size) @@ -98,7 +91,4 @@ typedef struct #define hypre_HandleOwnUmpireHostPool(hypre_handle) ((hypre_handle) -> own_umpire_host_pool) #define hypre_HandleOwnUmpirePinnedPool(hypre_handle) ((hypre_handle) -> own_umpire_pinned_pool) -#define hypre_HandleSyclComputeQueue(hypre_handle) hypre_SyclDataSyclComputeQueue(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclDevice(hypre_handle) hypre_SyclDataSyclDevice(hypre_HandleSyclData(hypre_handle)) -#define hypre_HandleSyclComputeQueueNum(hypre_handle) hypre_SyclDataSyclComputeQueueNum(hypre_HandleSyclData(hypre_handle)) #endif diff --git a/src/utilities/headers b/src/utilities/headers index 0c96b33fae..81f1301471 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -89,10 +89,9 @@ extern "C++" { #=========================================================================== cat umpire_allocator.h >> $INTERNAL_HEADER -cat cuda_utils.h >> $INTERNAL_HEADER -cat cuda_reducer.h >> $INTERNAL_HEADER +cat device_utils.h >> $INTERNAL_HEADER +cat device_reducer.h >> $INTERNAL_HEADER cat cub_allocator.h >> $INTERNAL_HEADER -cat sycl_utils.h >> $INTERNAL_HEADER #=========================================================================== # Include guards diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 5dc5af7ea8..dfb8a2e939 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -78,7 +78,7 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); + (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); #endif } @@ -99,7 +99,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleSyclComputeQueue(hypre_handle()))->memset(ptr, value, num).wait(); + 
(hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); #endif } @@ -122,26 +122,26 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) #if defined(HYPRE_USING_DEVICE_OPENMP) if (location == hypre_MEMORY_DEVICE) { - HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - hypre_HandleCudaComputeStream(hypre_handle())) ); + HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + hypre_HandleComputeStream(hypre_handle())) ); } else if (location == hypre_MEMORY_HOST) { HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - hypre_HandleCudaComputeStream(hypre_handle())) ); + hypre_HandleComputeStream(hypre_handle())) ); } #endif #if defined(HYPRE_USING_CUDA) if (location == hypre_MEMORY_DEVICE) { - HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - hypre_HandleCudaComputeStream(hypre_handle())) ); + HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + hypre_HandleComputeStream(hypre_handle())) ); } else if (location == hypre_MEMORY_HOST) { HYPRE_CUDA_CALL( cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - hypre_HandleCudaComputeStream(hypre_handle())) ); + hypre_HandleComputeStream(hypre_handle())) ); } #endif @@ -150,13 +150,13 @@ hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) /* *if (location == hypre_MEMORY_DEVICE) *{ - * HYPRE_HIP_CALL( hipMemPrefetchAsync(ptr, size, hypre_HandleCudaDevice(hypre_handle()), - * hypre_HandleCudaComputeStream(hypre_handle())) ); + * HYPRE_HIP_CALL( hipMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), + * hypre_HandleComputeStream(hypre_handle())) ); *} *else if (location == hypre_MEMORY_HOST) *{ * HYPRE_CUDA_CALL( hipMemPrefetchAsync(ptr, size, cudaCpuDeviceId, - * hypre_HandleCudaComputeStream(hypre_handle())) ); + * hypre_HandleComputeStream(hypre_handle())) ); *} */ #endif @@ -228,7 
+228,7 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_device(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_device(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -267,7 +267,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -308,7 +308,7 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_host(size, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -405,7 +405,7 @@ hypre_DeviceFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -435,7 +435,7 @@ hypre_UnifiedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -461,7 +461,7 @@ hypre_HostPinnedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleSyclComputeQueue(hypre_handle()))); + sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -516,7 +516,7 @@ static inline void hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst, 
hypre_MemoryLocation loc_src) { #if defined(HYPRE_USING_SYCL) - sycl::queue* q = hypre_HandleSyclComputeQueue(hypre_handle()); + sycl::queue* q = hypre_HandleComputeStream(hypre_handle()); #endif if (dst == NULL || src == NULL) @@ -718,7 +718,7 @@ hypre_GetExecPolicy1_core(hypre_MemoryLocation location) exec = HYPRE_EXEC_DEVICE; break; case hypre_MEMORY_UNIFIED : -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif break; @@ -765,7 +765,7 @@ hypre_GetExecPolicy2_core(hypre_MemoryLocation location1, if (location1 == hypre_MEMORY_UNIFIED && location2 == hypre_MEMORY_UNIFIED) { -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) exec = hypre_HandleDefaultExecPolicy(hypre_handle()); #endif } @@ -971,7 +971,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { HYPRE_Int ierr = 0; -#if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_GPU) *memory_location = hypre_MEMORY_UNDEFINED; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -1069,7 +1069,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) #if defined(HYPRE_USING_SYCL) *memory_location = hypre_MEMORY_UNDEFINED; sycl::usm::alloc allocType; - allocType = sycl::get_pointer_type(ptr, (hypre_HandleSyclComputeQueue(hypre_handle()))->get_context()); + allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); if (allocType == sycl::usm::alloc::unknown) { @@ -1089,7 +1089,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) } #endif //HYPRE_USING_SYCL -#else /* #if defined(HYPRE_USING_GPU) || defined(HYPRE_USING_SYCL) */ +#else /* #if defined(HYPRE_USING_GPU) */ *memory_location = hypre_MEMORY_HOST; #endif @@ -1390,7 +1390,7 @@ hypre_CachingMallocDevice(void **ptr, size_t nbytes) if 
(!hypre_HandleCubDevAllocator(hypre_handle())) { hypre_HandleCubDevAllocator(hypre_handle()) = - hypre_CudaDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), + hypre_DeviceDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), hypre_HandleCubMinBin(hypre_handle()), hypre_HandleCubMaxBin(hypre_handle()), hypre_HandleCubMaxCachedBytes(hypre_handle()), @@ -1414,7 +1414,7 @@ hypre_CachingMallocManaged(void **ptr, size_t nbytes) if (!hypre_HandleCubUvmAllocator(hypre_handle())) { hypre_HandleCubUvmAllocator(hypre_handle()) = - hypre_CudaDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), + hypre_DeviceDataCubCachingAllocatorCreate( hypre_HandleCubBinGrowth(hypre_handle()), hypre_HandleCubMinBin(hypre_handle()), hypre_HandleCubMaxBin(hypre_handle()), hypre_HandleCubMaxCachedBytes(hypre_handle()), @@ -1433,7 +1433,7 @@ hypre_CachingFreeManaged(void *ptr) } hypre_cub_CachingDeviceAllocator * -hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, +hypre_DeviceDataCubCachingAllocatorCreate(hypre_uint bin_growth, hypre_uint min_bin, hypre_uint max_bin, size_t max_cached_bytes, @@ -1454,10 +1454,10 @@ hypre_CudaDataCubCachingAllocatorCreate(hypre_uint bin_growth, } void -hypre_CudaDataCubCachingAllocatorDestroy(hypre_CudaData *data) +hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data) { - delete hypre_CudaDataCubDevAllocator(data); - delete hypre_CudaDataCubUvmAllocator(data); + delete hypre_DeviceDataCubDevAllocator(data); + delete hypre_DeviceDataCubUvmAllocator(data); } #endif // #ifdef HYPRE_USING_DEVICE_POOL @@ -1532,7 +1532,7 @@ HYPRE_Int hypre_umpire_device_pooled_allocate(void **ptr, size_t nbytes) { hypre_Handle *handle = hypre_handle(); - const hypre_int device_id = hypre_HandleCudaDevice(handle); + const hypre_int device_id = hypre_HandleDevice(handle); char resource_name[16]; const char *pool_name = hypre_HandleUmpireDevicePoolName(handle); From 
bafa6c2890f746695ebb45940f3544051ba50368 Mon Sep 17 00:00:00 2001 From: Wayne Bradford Mitchell Date: Tue, 27 Jul 2021 14:38:14 -0700 Subject: [PATCH 03/44] Fix cuda compilation Quick fix for compilation --with-cuda. Ran some tests on lassen and quartz as well to make sure I didn't break the cuda or cpu versions. --- src/seq_mv/csr_spgemm_device.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index ca7dff4256..60d871a5ac 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -116,7 +116,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMethod( HYPRE_Int value ) { if (value == 1 || value == 2 || value == 3) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_method = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_method = value; } else { @@ -129,7 +129,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMethod( HYPRE_Int value ) HYPRE_Int hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateNSamples( HYPRE_Int value ) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_nsamples = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_nsamples = value; return 0; } @@ -139,7 +139,7 @@ hypre_CSRMatrixDeviceSpGemmSetRownnzEstimateMultFactor( HYPRE_Real value ) { if (value > 0.0) { - hypre_HandleCudaData(hypre_handle())->spgemm_rownnz_estimate_mult_factor = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_rownnz_estimate_mult_factor = value; } else { @@ -154,7 +154,7 @@ hypre_CSRMatrixDeviceSpGemmSetHashType( char value ) { if (value == 'L' || value == 'Q' || value == 'D') { - hypre_HandleCudaData(hypre_handle())->spgemm_hash_type = value; + hypre_HandleDeviceData(hypre_handle())->spgemm_hash_type = value; } else { From c16315d685307f5e0908c19a0944cfcf68e2922d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 28 Jul 2021 23:39:07 +0000 Subject: [PATCH 04/44] Choose default exec policy 
for matvec Modified csr matvec to choose the default execution policy instead of hard-coded device policy. This now passes tests and seems to run as expected using sycl unified memory and using host execution for everything. --- src/seq_mv/csr_matvec.c | 8 +- src/test/Makefile | 1 + src/test/TEST_ij/solvers.jobs | 165 +++++++++++++++++----------------- src/utilities/general.c | 13 ++- src/utilities/memory.c | 26 +++--- 5 files changed, 105 insertions(+), 108 deletions(-) diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c index 38d2f1d244..90f57d44da 100644 --- a/src/seq_mv/csr_matvec.c +++ b/src/seq_mv/csr_matvec.c @@ -711,9 +711,7 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) - //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - //RL: TODO back to hypre_GetExecPolicy1 later - HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE; + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset); @@ -981,9 +979,7 @@ hypre_CSRMatrixMatvecT( HYPRE_Complex alpha, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) - //HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - //RL: TODO back to hypre_GetExecPolicy1 later - HYPRE_ExecutionPolicy exec = HYPRE_EXEC_DEVICE; + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(1, alpha, A, x, beta, y, y, 0 ); diff --git a/src/test/Makefile b/src/test/Makefile index 10c3ac32cf..5a4c606193 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -144,6 +144,7 @@ ij: ij.${OBJ_SUFFIX} @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} +# WM: TODO: remove simple: simple.${OBJ_SUFFIX} @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_ij/solvers.jobs index 11dfba0f37..1f1c68c56c 100755 --- a/src/test/TEST_ij/solvers.jobs +++ b/src/test/TEST_ij/solvers.jobs @@ -29,63 +29,64 @@ # 60: DS_FlexGMRES # #============================================================================= +# WM: TODO remove -exec_host -mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0 -mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1 -mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2 -mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3 -mpirun -np 2 ./ij -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 -mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5 -#mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6 -#mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7 -mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 -mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 -mpirun -np 2 ./ij -solver 16 -rhsrand > solvers.out.12 -mpirun -np 2 ./ij -solver 17 -rhsrand > solvers.out.13 -mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 > solvers.out.14 -mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 -mpirun -np 2 ./ij -solver 17 -rhsrand -unroll 4 > solvers.out.16 -mpirun -np 2 ./ij -solver 3 -rhsrand -check_residual > solvers.out.17 -mpirun -np 2 ./ij -solver 4 -rhsrand -check_residual > solvers.out.18 +mpirun -np 2 ./ij -exec_host -solver 1 -rhsrand > solvers.out.0 +mpirun -np 2 ./ij -exec_host -solver 2 -rhsrand > solvers.out.1 +mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand > solvers.out.2 +mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand > solvers.out.3 +mpirun -np 2 ./ij -exec_host -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 +mpirun -np 2 ./ij -exec_host -solver 6 -rhsrand > solvers.out.5 +#mpirun -np 2 ./ij -exec_host -solver 7 -rhsrand 
> solvers.out.6 +#mpirun -np 2 ./ij -exec_host -solver 8 -rhsrand > solvers.out.7 +mpirun -np 2 ./ij -exec_host -solver 20 -rhsrand > solvers.out.8 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand > solvers.out.9 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 +mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 +mpirun -np 2 ./ij -exec_host -solver 16 -rhsrand > solvers.out.12 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand > solvers.out.13 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 > solvers.out.14 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 +mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -unroll 4 > solvers.out.16 +mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand -check_residual > solvers.out.17 +mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand -check_residual > solvers.out.18 #systems AMG run ...unknown approach, hybrid approach, nodal approach -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh -mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh +mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn #LGMRS and FlexGMRES -mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101 -mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102 -mpirun -np 2 ./ij -solver 60 -rhsrand > solvers.out.103 -mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104 +mpirun -np 2 ./ij -exec_host -solver 50 -rhsrand > solvers.out.101 +mpirun -np 2 ./ij -exec_host -solver 51 -rhsrand > solvers.out.102 +mpirun -np 2 ./ij -exec_host -solver 60 -rhsrand > 
solvers.out.103 +mpirun -np 2 ./ij -exec_host -solver 61 -rhsrand > solvers.out.104 #agglomerated coarse grid solve -mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 +mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 #redundant coarse grid solve -mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 +mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 #additive cycles -mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 -mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110 -mpirun -np 4 ./ij -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119 -mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 +mpirun -np 2 ./ij -exec_host -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 
-w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 +mpirun -np 2 ./ij -exec_host -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -add_rlx 0 -add_w 0.7 -mult_add 0 > solvers.out.110 +mpirun -np 4 ./ij -exec_host -n 20 20 20 -P 2 2 1 -agg_nl 1 -solver 1 -simple 0 > solvers.out.111 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -additive 1 > solvers.out.112 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 > solvers.out.113 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -add_Pmx 5 -add_end 2 > solvers.out.118 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 ns 2 > solvers.out.119 +mpirun -np 8 ./ij -exec_host -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 #nonGalerkin version -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 #RAP options -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 -mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 +mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # # MGR and MGR-PCG @@ -93,26 +94,26 @@ mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # coarse grid solver checks (1-level MGR == AMG (or coarse grid solver)) # Also checks for keeping coarse nodes to coarsest level # coarse grid size in output should be ~ mgr_num_reserved_nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 -mpirun -np 2 ./ij 
-solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 # multi level MGR tests with different coarse grid type strategies # Fix non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 +mpirun -np 2 ./ij 
-exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 # Not fixed non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 -mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 +mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 # MGR-PCG tests -mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 -mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 +mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 +mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 
-mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 # # hypre_ILU tests @@ -121,39 +122,39 @@ mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_ # Tests ILU-(Flex)GMRES # Test AMG with ILU as a complex smoother # -mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 -mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 -mpirun -np 1 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 +mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 # parallel ILU # BJ -mpirun -np 2 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 -mpirun -np 2 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 # GMRES+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 -mpirun -np 2 ./ij -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 # NSH+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 -mpirun -np 2 ./ij -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 +mpirun -np 2 ./ij 
-exec_host -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 # RAS+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 -mpirun -np 2 ./ij -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 # ddPQ-GMRES+ILU -mpirun -np 2 ./ij -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 -mpirun -np 2 ./ij -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 +mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 ## ILU-GMRES -mpirun -np 2 ./ij -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 -mpirun -np 2 ./ij -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 -mpirun -np 2 ./ij -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 -mpirun -np 2 ./ij -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 +mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 ## ILU-FlexGMRES -mpirun -np 2 ./ij -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 -mpirun -np 2 ./ij -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 -mpirun -np 2 ./ij 
-solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 -mpirun -np 2 ./ij -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 -mpirun -np 2 ./ij -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 -mpirun -np 2 ./ij -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 ## RAP-ILU -mpirun -np 2 ./ij -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 +mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 ## ILU smoother for AMG -mpirun -np 2 ./ij -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 -mpirun -np 2 ./ij -solver 0 -smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 +mpirun -np 2 ./ij -exec_host -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 +mpirun -np 2 ./ij -exec_host -solver 0 -smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 diff --git a/src/utilities/general.c b/src/utilities/general.c index 9bafe21224..ef413f3395 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -54,13 +54,6 @@ hypre_HandleCreate() hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); #endif -// WM: temporarily set the default exec policy to 
host for sycl until more functionality is available -#if defined(HYPRE_USING_SYCL) - hypre_HandleDefaultExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleStructExecPolicy(hypre_handle_) = HYPRE_EXEC_HOST; - hypre_HandleDeviceData(hypre_handle_) = hypre_DeviceDataCreate(); -#endif - return hypre_handle_; } @@ -76,7 +69,13 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif +// WM: in debug mode, hypre_TFree() checks the pointer location, which requires the +// hypre_handle_'s compute queue if using sycl. But this was just destroyed above. +#if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL) + free(hypre_handle_); +#else hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST); +#endif return hypre_error_flag; } diff --git a/src/utilities/memory.c b/src/utilities/memory.c index dfb8a2e939..ece79f5d68 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -78,7 +78,7 @@ hypre_DeviceMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); + HYPRE_SYCL_CALL( (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait() ); #endif } @@ -99,7 +99,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) #endif #if defined(HYPRE_USING_SYCL) - (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait(); + HYPRE_SYCL_CALL( (hypre_HandleComputeStream(hypre_handle()))->memset(ptr, value, num).wait() ); #endif } @@ -267,7 +267,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -308,7 +308,7 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit) #endif #if 
defined(HYPRE_USING_SYCL) - ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_host(size, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -405,7 +405,7 @@ hypre_DeviceFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_DEVICE) */ @@ -435,7 +435,7 @@ hypre_UnifiedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -461,7 +461,7 @@ hypre_HostPinnedFree(void *ptr) #endif #if defined(HYPRE_USING_SYCL) - sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))); + HYPRE_SYCL_CALL( sycl::free(ptr, *(hypre_HandleComputeStream(hypre_handle()))) ); #endif #endif /* #if defined(HYPRE_USING_UMPIRE_PINNED) */ @@ -566,7 +566,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -588,7 +588,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -610,7 +610,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -637,7 +637,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds 
#endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -664,7 +664,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } @@ -692,7 +692,7 @@ hypre_Memcpy_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_ds #endif #if defined(HYPRE_USING_SYCL) - q->memcpy(dst, src, size).wait(); + HYPRE_SYCL_CALL( q->memcpy(dst, src, size).wait() ); #endif return; } From c58f9445b88189e67f4c3056e2f2363a42c08a8e Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 3 Aug 2021 01:25:19 +0000 Subject: [PATCH 05/44] Start boxloop implementation Starting to put in boxloop sycl code. This compiles, but crashes. --- src/struct_mv/_hypre_struct_mv.h | 2 +- src/struct_mv/_hypre_struct_mv.hpp | 484 +++++++++++++++++++++++++++++ src/struct_mv/boxloop_sycl.h | 482 ++++++++++++++++++++++++++++ src/struct_mv/headers | 9 +- src/utilities/_hypre_utilities.hpp | 126 +++++++- src/utilities/device_utils.c | 41 +++ src/utilities/device_utils.h | 126 +++++++- 7 files changed, 1260 insertions(+), 10 deletions(-) create mode 100644 src/struct_mv/boxloop_sycl.h diff --git a/src/struct_mv/_hypre_struct_mv.h b/src/struct_mv/_hypre_struct_mv.h index f95c4a74a0..70dbdf9f41 100644 --- a/src/struct_mv/_hypre_struct_mv.h +++ b/src/struct_mv/_hypre_struct_mv.h @@ -2484,7 +2484,7 @@ hypre__J = hypre__thread; i1 = i2 = 0; \ #endif -#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) +#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) && !defined(HYPRE_USING_SYCL) /****************************************************************************** * Copyright 1998-2019 Lawrence Livermore National 
Security, LLC and other * HYPRE Project Developers. See the top-level COPYRIGHT file for details. diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index bf411411e6..d0674b2b0e 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1149,6 +1149,490 @@ else \ #endif /* #ifndef HYPRE_BOXLOOP_CUDA_HEADER */ +#elif defined(HYPRE_USING_SYCL) +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_BOXLOOP_SYCL_HEADER +#define HYPRE_BOXLOOP_SYCL_HEADER + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? + *********************************************************************/ +#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ + { (kernel_name)(item, __VA_ARGS__); } ); \ + }).wait_and_throw(); \ + } \ +} + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function and kernel + *********************************************************************/ + +template +void +forall_kernel( sycl::nd_item<1> item, + LOOP_BODY loop_body, + HYPRE_Int length ) +{ + const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); + + if (idx < length) + { + loop_body(idx); + } +} + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for HYPRE_SMP_SCHEDULE +#endif + for (HYPRE_Int idx = 0; idx < length; idx++) + { + loop_body(idx); + } + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ + HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + HYPRE_Int idx_local = idx; \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + +/* /1* get 3-D local_idx into 'index' *1/ */ +/* #define hypre_BoxLoopGetIndex(index) \ */ +/* index[0] = hypre_IndexD(local_idx, 0); \ */ +/* index[1] = hypre_IndexD(local_idx, 1); \ */ +/* index[2] = hypre_IndexD(local_idx, 2); */ + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + + + + + + + + + + + + + + + + + + + +/********************************************************************* + * HOST IMPLEMENTATION + *********************************************************************/ + +#ifdef HYPRE_USING_OPENMP +#define HYPRE_BOX_REDUCTION +#if defined(WIN32) && defined(_MSC_VER) +#define Pragma(x) __pragma(HYPRE_XSTR(x)) +#else +#define Pragma(x) _Pragma(HYPRE_XSTR(x)) +#endif +#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else /* #ifdef HYPRE_USING_OPENMP */ +#define OMP0 +#define OMP1 +#endif /* #ifdef HYPRE_USING_OPENMP */ + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + for 
(hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop0End() \ + } \ + zypre_BoxLoopInc1(); \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + HYPRE_Int i1; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop2End(i1, i2) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, 
\ + dbox3, start3, stride3, i3) \ +{ \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopDeclareK(4); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + zypre_BoxLoopSetK(4, i4); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for 
(hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + i4 += hypre__i0inc4; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + i4 += hypre__ikinc4[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BasicBoxLoopInitK(1, stride1); \ + zypre_BasicBoxLoopInitK(2, stride2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + + +#define hypre_LoopBegin(size, idx) \ +{ \ + HYPRE_Int idx; \ + OMP0 \ + for (idx = 0; idx < size; idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex + +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0End zypre_newBoxLoop0End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define hypre_BoxLoop4End zypre_newBoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + +/* Reduction */ 
+#define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ + hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + +#define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ + hypre_BoxLoop1End(i1) + +#define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ + hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) + +#define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ + hypre_BoxLoop2End(i1, i2) + +#endif + #endif #ifdef __cplusplus diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h new file mode 100644 index 0000000000..3874668ef8 --- /dev/null +++ b/src/struct_mv/boxloop_sycl.h @@ -0,0 +1,482 @@ +/****************************************************************************** + * Copyright 1998-2019 Lawrence Livermore National Security, LLC and other + * HYPRE Project Developers. See the top-level COPYRIGHT file for details. + * + * SPDX-License-Identifier: (Apache-2.0 OR MIT) + ******************************************************************************/ + +/****************************************************************************** + * + * Header info for the BoxLoop + * + *****************************************************************************/ + +/*-------------------------------------------------------------------------- + * BoxLoop macros: + *--------------------------------------------------------------------------*/ + +#ifndef HYPRE_BOXLOOP_SYCL_HEADER +#define HYPRE_BOXLOOP_SYCL_HEADER + +typedef struct hypre_Boxloop_struct +{ + HYPRE_Int lsize0,lsize1,lsize2; + HYPRE_Int strides0,strides1,strides2; + HYPRE_Int bstart0,bstart1,bstart2; + HYPRE_Int bsize0,bsize1,bsize2; +} hypre_Boxloop; + + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? 
+ *********************************************************************/ +#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ + { (kernel_name)(item, __VA_ARGS__); } ); \ + }).wait_and_throw(); \ + } \ +} + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function and kernel + *********************************************************************/ + +template +void +forall_kernel( sycl::nd_item<1> item, + LOOP_BODY loop_body, + HYPRE_Int length ) +{ + const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); + + if (idx < length) + { + loop_body(idx); + } +} + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +#ifdef HYPRE_USING_OPENMP +#pragma omp parallel for HYPRE_SMP_SCHEDULE +#endif + for (HYPRE_Int idx = 0; idx < length; idx++) + { + loop_body(idx); + } + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ + HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); + } +} + +#ifdef __cplusplus +} +#endif + 
+/********************************************************************* + * Init/Declare/IncK etc. + *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + HYPRE_Int idx_local = idx; \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, 
hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + +/* /1* get 3-D local_idx into 'index' *1/ */ +/* #define hypre_BoxLoopGetIndex(index) \ */ +/* index[0] = hypre_IndexD(local_idx, 0); \ */ +/* index[1] = hypre_IndexD(local_idx, 1); \ */ +/* index[2] = hypre_IndexD(local_idx, 2); */ + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (HYPRE_Int idx) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + + + + + + + + + + + + + + + + + + + +/********************************************************************* + * HOST IMPLEMENTATION + *********************************************************************/ + +#ifdef HYPRE_USING_OPENMP +#define HYPRE_BOX_REDUCTION +#if defined(WIN32) && defined(_MSC_VER) +#define Pragma(x) __pragma(HYPRE_XSTR(x)) +#else +#define Pragma(x) _Pragma(HYPRE_XSTR(x)) +#endif +#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) +#else /* #ifdef HYPRE_USING_OPENMP */ +#define OMP0 +#define OMP1 +#endif /* #ifdef HYPRE_USING_OPENMP */ + +#define zypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopInit(ndim, loop_size); \ + OMP1 \ + for (hypre__block = 0; 
hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + zypre_BoxLoopSet(); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop0End() \ + } \ + zypre_BoxLoopInc1(); \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop1Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1) \ +{ \ + HYPRE_Int i1; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop1End(i1) \ + i1 += hypre__i0inc1; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define zypre_newBoxLoop2Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop2End(i1, i2) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + + +#define 
zypre_newBoxLoop3Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3) \ +{ \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, i3); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop3End(i1, i2, i3) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBoxLoop4Begin(ndim, loop_size, \ + dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopDeclareK(3); \ + zypre_BoxLoopDeclareK(4); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ + zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ + zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ + zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2, i3, i4; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + zypre_BoxLoopSetK(3, 
i3); \ + zypre_BoxLoopSetK(4, i4); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + +#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ + i1 += hypre__i0inc1; \ + i2 += hypre__i0inc2; \ + i3 += hypre__i0inc3; \ + i4 += hypre__i0inc4; \ + } \ + zypre_BoxLoopInc1(); \ + i1 += hypre__ikinc1[hypre__d]; \ + i2 += hypre__ikinc2[hypre__d]; \ + i3 += hypre__ikinc3[hypre__d]; \ + i4 += hypre__ikinc4[hypre__d]; \ + zypre_BoxLoopInc2(); \ + } \ + } \ +} + +#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ + stride1, i1, \ + stride2, i2) \ +{ \ + zypre_BoxLoopDeclare(); \ + zypre_BoxLoopDeclareK(1); \ + zypre_BoxLoopDeclareK(2); \ + zypre_BoxLoopInit(ndim, loop_size); \ + zypre_BasicBoxLoopInitK(1, stride1); \ + zypre_BasicBoxLoopInitK(2, stride2); \ + OMP1 \ + for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ + { \ + HYPRE_Int i1, i2; \ + zypre_BoxLoopSet(); \ + zypre_BoxLoopSetK(1, i1); \ + zypre_BoxLoopSetK(2, i2); \ + for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ + { \ + for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ + { + + +#define hypre_LoopBegin(size, idx) \ +{ \ + HYPRE_Int idx; \ + OMP0 \ + for (idx = 0; idx < size; idx ++) \ + { + +#define hypre_LoopEnd() \ + } \ +} + +#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex + +#define hypre_BoxLoopBlock zypre_BoxLoopBlock +#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin +#define hypre_BoxLoop0End zypre_newBoxLoop0End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin +#define hypre_BoxLoop2End zypre_newBoxLoop2End +#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin +#define hypre_BoxLoop3End zypre_newBoxLoop3End +#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin +#define 
hypre_BoxLoop4End zypre_newBoxLoop4End +#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + +/* Reduction */ +#define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ + hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + +#define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ + hypre_BoxLoop1End(i1) + +#define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ + hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) + +#define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ + hypre_BoxLoop2End(i1, i2) + +#endif diff --git a/src/struct_mv/headers b/src/struct_mv/headers index 645f39c444..fa32575581 100755 --- a/src/struct_mv/headers +++ b/src/struct_mv/headers @@ -61,7 +61,7 @@ cat boxloop_omp_device.h >> $INTERNAL_HEADER cat >> $INTERNAL_HEADER <<@ -#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) +#elif !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) && !defined(HYPRE_USING_CUDA) && !defined(HYPRE_USING_HIP) && !defined(HYPRE_USING_SYCL) @ cat boxloop_host.h >> $INTERNAL_HEADER @@ -137,6 +137,13 @@ cat boxloop_cuda.h >> $INTERNAL_HEADER cat >> $INTERNAL_HEADER <<@ +#elif defined(HYPRE_USING_SYCL) +@ + +cat boxloop_sycl.h >> $INTERNAL_HEADER + +cat >> $INTERNAL_HEADER <<@ + #endif @ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index baaf16d7df..8a6f05b937 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -70,8 +70,8 @@ struct hypre_umpire_device_allocator * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ -#ifndef HYPRE_CUDA_UTILS_H -#define HYPRE_CUDA_UTILS_H +#ifndef HYPRE_DEVICE_UTILS_H +#define HYPRE_DEVICE_UTILS_H #if defined(HYPRE_USING_GPU) @@ -408,6 
+408,125 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_SYCL) +/* return the number of work-items in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) +{ + return item.get_group().get_local_linear_range(); +} + +/* return the flattened or linearlized work-item id in current work-group (not global)*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) +{ + return item.get_local_linear_id(); +} + +/* return the number of sub-groups in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the sub_group id in work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_linear_id(); +} + +/* return the work-item lane id in a sub_group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) +{ + return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); +} + +/* return the num of work_groups in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) +{ + // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 + + switch (dim) + { + case 1: + return (item.get_group_range(0)); + case 2: + return (item.get_group_range(0) * item.get_group_range(1)); + case 3: + return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); + } + + return -1; +} + +/* return the flattened or linearlized work-group id in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int 
hypre_cuda_get_block_id(sycl::nd_item& item) +{ + return item.get_group_linear_id(); +} + +/* return the number of work-items in global iteration space*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) +{ + switch (dim) + { + case 1: + return (item.get_global_range(0)); + case 2: + return (item.get_global_range(0) * item.get_global_range(1)); + case 3: + return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); + } + + return -1; +} + +/* return the flattened work-item id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) +{ + return item.get_global_linear_id(); +} + +/* return the number of sub-groups in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) +{ + return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); +} + +/* return the flattened sub-group id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) +{ + return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + + hypre_cuda_get_warp_id(item); +} + +/* device_utils.c */ +sycl::range<1> hypre_GetDefaultCUDABlockDimension(); + +sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); + +#endif // #if defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #include @@ -532,7 +651,6 @@ using namespace thrust::placeholders; #endif // HYPRE_USING_UMPIRE_DEVICE - /* return the number of threads in block */ template static __device__ __forceinline__ @@ -1013,7 +1131,7 @@ struct equal : public thrust::unary_function -/* cuda_utils.c */ +/* device_utils.c */ dim3 hypre_GetDefaultCUDABlockDimension(); dim3 
hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 3c79401f4e..f5dbdc07a1 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -8,6 +8,47 @@ #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" +#if defined(HYPRE_USING_SYCL) +#include +// WM: TODO: verify +sycl::range<1> hypre_GetDefaultCUDABlockDimension() +{ + // 256 - max work group size for Gen9 + // 512 - max work group size for ATS + sycl::range<1> wgDim(64); + return wgDim; +} + +// WM: TODO: verify +sycl::range<1> hypre_GetDefaultCUDAGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) +{ + HYPRE_Int num_WGs = 0; + HYPRE_Int num_workitems_per_WG = wgDim[0]; + + if (granularity[0] == 't') + { + num_WGs = (n + num_workitems_per_WG - 1) / num_workitems_per_WG; + } + else if (granularity[0] == 'w') + { + HYPRE_Int num_subgroups_per_block = num_workitems_per_WG >> HYPRE_WARP_BITSHIFT; + hypre_assert(num_subgroups_per_block * HYPRE_WARP_SIZE == num_workitems_per_WG); + num_WGs = (n + num_subgroups_per_block - 1) / num_subgroups_per_block; + } + else + { + hypre_printf("Error %s %d: Unknown granularity !\n", __FILE__, __LINE__); + hypre_assert(0); + } + + sycl::range<1> gDim(num_WGs); + + return gDim; +} +#endif + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) /* diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index d24e321686..13a01520df 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -5,8 +5,8 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ -#ifndef HYPRE_CUDA_UTILS_H -#define HYPRE_CUDA_UTILS_H +#ifndef HYPRE_DEVICE_UTILS_H +#define HYPRE_DEVICE_UTILS_H #if defined(HYPRE_USING_GPU) @@ -343,6 +343,125 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +#if 
defined(HYPRE_USING_SYCL) +/* return the number of work-items in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) +{ + return item.get_group().get_local_linear_range(); +} + +/* return the flattened or linearlized work-item id in current work-group (not global)*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) +{ + return item.get_local_linear_id(); +} + +/* return the number of sub-groups in current work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the sub_group id in work-group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_linear_id(); +} + +/* return the work-item lane id in a sub_group */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) +{ + return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); +} + +/* return the num of work_groups in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) +{ + // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 + + switch (dim) + { + case 1: + return (item.get_group_range(0)); + case 2: + return (item.get_group_range(0) * item.get_group_range(1)); + case 3: + return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); + } + + return -1; +} + +/* return the flattened or linearlized work-group id in nd_range */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) +{ + return item.get_group_linear_id(); +} + +/* 
return the number of work-items in global iteration space*/ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) +{ + switch (dim) + { + case 1: + return (item.get_global_range(0)); + case 2: + return (item.get_global_range(0) * item.get_global_range(1)); + case 3: + return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); + } + + return -1; +} + +/* return the flattened work-item id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) +{ + return item.get_global_linear_id(); +} + +/* return the number of sub-groups in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) +{ + return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); +} + +/* return the flattened sub-group id in global iteration space */ +template +static __inline__ __attribute__((always_inline)) +hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) +{ + return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + + hypre_cuda_get_warp_id(item); +} + +/* device_utils.c */ +sycl::range<1> hypre_GetDefaultCUDABlockDimension(); + +sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); + +#endif // #if defined(HYPRE_USING_SYCL) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) #include @@ -467,7 +586,6 @@ using namespace thrust::placeholders; #endif // HYPRE_USING_UMPIRE_DEVICE - /* return the number of threads in block */ template static __device__ __forceinline__ @@ -948,7 +1066,7 @@ struct equal : public thrust::unary_function -/* cuda_utils.c */ +/* device_utils.c */ dim3 hypre_GetDefaultCUDABlockDimension(); dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); From 
25348d4e26108f963d60a1032d190fb9cad55aa7 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 16 Sep 2021 23:15:09 +0000 Subject: [PATCH 06/44] Remove nonfunctional code for fresh start --- src/struct_mv/_hypre_struct_mv.hpp | 181 +---------------------------- src/struct_mv/boxloop_sycl.h | 181 +---------------------------- 2 files changed, 4 insertions(+), 358 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d0674b2b0e..beaed26fda 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1181,181 +1181,6 @@ typedef struct hypre_Boxloop_struct -/********************************************************************* - * put this in _hypre_utilities.hpp ? - *********************************************************************/ -#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ -{ \ - if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ - { \ - hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ - __FILE__, __LINE__, \ - gridsize[0], blocksize[0]); \ - assert(0); exit(1); \ - } \ - else \ - { \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ - { \ - cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ - { (kernel_name)(item, __VA_ARGS__); } ); \ - }).wait_and_throw(); \ - } \ -} - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function and kernel - *********************************************************************/ - -template -void -forall_kernel( sycl::nd_item<1> item, - LOOP_BODY loop_body, - HYPRE_Int length ) -{ - const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); - - if (idx < length) - { - loop_body(idx); - } -} - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = 
hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for HYPRE_SMP_SCHEDULE -#endif - for (HYPRE_Int idx = 0; idx < length; idx++) - { - loop_body(idx); - } - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ - HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. - *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - 
databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ - databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - -/* /1* get 3-D local_idx into 'index' *1/ */ -/* #define hypre_BoxLoopGetIndex(index) \ */ -/* index[0] = hypre_IndexD(local_idx, 0); \ */ -/* index[1] = hypre_IndexD(local_idx, 1); \ */ -/* index[2] = hypre_IndexD(local_idx, 2); */ - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (HYPRE_Int idx) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - - - @@ -1604,10 +1429,8 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoopBlock 
zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 3874668ef8..0804d42fb7 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -29,181 +29,6 @@ typedef struct hypre_Boxloop_struct -/********************************************************************* - * put this in _hypre_utilities.hpp ? - *********************************************************************/ -#define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ -{ \ - if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ - { \ - hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ - __FILE__, __LINE__, \ - gridsize[0], blocksize[0]); \ - assert(0); exit(1); \ - } \ - else \ - { \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ - { \ - cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ - { (kernel_name)(item, __VA_ARGS__); } ); \ - }).wait_and_throw(); \ - } \ -} - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function and kernel - *********************************************************************/ - -template -void -forall_kernel( sycl::nd_item<1> item, - LOOP_BODY loop_body, - HYPRE_Int length ) -{ - const HYPRE_Int idx = hypre_cuda_get_grid_thread_id<1>(item); - - if (idx < length) - { - loop_body(idx); - } -} - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -#ifdef HYPRE_USING_OPENMP -#pragma omp parallel for HYPRE_SMP_SCHEDULE -#endif - for (HYPRE_Int idx = 0; idx < length; idx++) - { - loop_body(idx); - } - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - /* HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); */ - HYPRE_SYCL_1D_LAUNCH(forall_kernel, gDim, bDim, loop_body, length); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. 
- *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ - databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - -/* /1* get 3-D local_idx into 'index' *1/ */ -/* #define hypre_BoxLoopGetIndex(index) \ */ -/* index[0] = hypre_IndexD(local_idx, 0); \ */ -/* index[1] = hypre_IndexD(local_idx, 1); \ */ -/* index[2] = hypre_IndexD(local_idx, 2); */ - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (HYPRE_Int idx) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - - - @@ -452,10 +277,8 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin From 58b6e23f9e8b724a9f026e49ae42d3bbf91e6165 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 16 Sep 2021 18:08:22 -0700 Subject: [PATCH 07/44] Add simple driver and remove problematic flag from configure --- src/configure | 2 +- src/test/simple.c | 623 
++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 624 insertions(+), 1 deletion(-) create mode 100644 src/test/simple.c diff --git a/src/configure b/src/configure index 328027a730..b3a0f72762 100755 --- a/src/configure +++ b/src/configure @@ -9094,7 +9094,7 @@ done SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " if test x"$hypre_using_debug" == x"yes"; then : - SYCLCXXFLAGS="-O0 -Wall -g -gdbx ${SYCLCXXFLAGS}" + SYCLCXXFLAGS="-O0 -Wall -g ${SYCLCXXFLAGS}" elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : fi diff --git a/src/test/simple.c b/src/test/simple.c new file mode 100644 index 0000000000..0649fef677 --- /dev/null +++ b/src/test/simple.c @@ -0,0 +1,623 @@ +/* WM: todo - remove this file from git */ + +#include "_hypre_utilities.h" +#include "_hypre_utilities.hpp" +#include "HYPRE.h" +#include "_hypre_struct_mv.h" +#include "_hypre_struct_mv.hpp" + +HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) ; + + + +/********************************************************************* + * put this in _hypre_utilities.hpp ? + * WM: todo - if you can wrap the basic parallel_for call for use elsewhere... + *********************************************************************/ +/* #define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ */ +/* { \ */ +/* if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ */ +/* { \ */ +/* hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ */ +/* __FILE__, __LINE__, \ */ +/* gridsize[0], blocksize[0]); \ */ +/* assert(0); exit(1); \ */ +/* } \ */ +/* else \ */ +/* { \ */ +/* hypre_printf("WM: debug - inside BoxLoopforall(), submitting to queue\n"); \ */ +/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ */ +/* { \ */ +/* cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ */ +/* { (kernel_name)(item, __VA_ARGS__); } ); \ */ +/* }).wait_and_throw(); \ */ +/* } \ */ +/* } */ + + + +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + /* dim 0 */ \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + /* dim 1 */ \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + /* dim 2 */ \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += 
(hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }, hypre__tot); \ +} + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + +HYPRE_Int +my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, + HYPRE_Complex values ) +{ + hypre_Box *v_data_box; + + HYPRE_Complex *vp; + + hypre_BoxArray *boxes; + hypre_Box *box; + hypre_Index loop_size; + hypre_IndexRef start; + hypre_Index unit_stride; + + HYPRE_Int i; + + /*----------------------------------------------------------------------- + * Set the vector coefficients + *-----------------------------------------------------------------------*/ + + hypre_SetIndex(unit_stride, 1); + + boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(vector)); + hypre_ForBoxI(i, boxes) + { + box = hypre_BoxArrayBox(boxes, i); + start = hypre_BoxIMin(box); + + v_data_box = + hypre_BoxArrayBox(hypre_StructVectorDataSpace(vector), i); + vp = hypre_StructVectorBoxData(vector, i); + + hypre_BoxGetSize(box, loop_size); + + // WM: question - What's DEVICE_VAR? 
+#define DEVICE_VAR is_device_ptr(vp) + my_hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + v_data_box, start, unit_stride, vi); + { + vp[vi] = values; + } + my_hypre_BoxLoop1End(vi); +#undef DEVICE_VAR + } + + return hypre_error_flag; +} + +HYPRE_Int +my_hypre_StructAxpy( HYPRE_Complex alpha, + hypre_StructVector *x, + hypre_StructVector *y ) +{ + hypre_Box *x_data_box; + hypre_Box *y_data_box; + + HYPRE_Complex *xp; + HYPRE_Complex *yp; + + hypre_BoxArray *boxes; + hypre_Box *box; + hypre_Index loop_size; + hypre_IndexRef start; + hypre_Index unit_stride; + + HYPRE_Int i; + + hypre_SetIndex(unit_stride, 1); + + boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(y)); + hypre_ForBoxI(i, boxes) + { + box = hypre_BoxArrayBox(boxes, i); + start = hypre_BoxIMin(box); + + x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); + y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i); + + xp = hypre_StructVectorBoxData(x, i); + yp = hypre_StructVectorBoxData(y, i); + + hypre_BoxGetSize(box, loop_size); + +/* WM: what is the DEVICE_VAR thing? 
*/ +#define DEVICE_VAR is_device_ptr(yp,xp) + /* WM: todo */ + /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ + /* x_data_box, start, unit_stride, xi, */ + /* y_data_box, start, unit_stride, yi); */ + /* { */ + /* yp[yi] += alpha * xp[xi]; */ + /* } */ + /* my_hypre_BoxLoop2End(xi, yi); */ +#undef DEVICE_VAR + } + + return hypre_error_flag; +} + + +/**************************** + * main + ****************************/ + +hypre_int +main( hypre_int argc, + char *argv[] ) +{ + /* variables */ + HYPRE_Int i, ix, iy, iz, ib; + HYPRE_Int p, q, r; + HYPRE_Int nx, ny, nz; + HYPRE_Int bx, by, bz; + HYPRE_Int nblocks; + HYPRE_Int dim; + HYPRE_Int sym; + HYPRE_Int **offsets; + HYPRE_Int **iupper; + HYPRE_Int **ilower; + HYPRE_Int periodic[3]; + HYPRE_Int istart[3]; + HYPRE_StructGrid grid; + HYPRE_StructVector b; + HYPRE_StructVector x; + HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; + + dim = 1; + sym = 1; + nx = 1000; + ny = 1; + nz = 1; + bx = 1; + by = 1; + bz = 1; + p = 1; + q = 1; + r = 1; + periodic[0] = 0; + periodic[1] = 0; + periodic[2] = 0; + istart[0] = -3; + istart[1] = -3; + istart[2] = -3; + + for (i = 0; i < 2*dim; i++) + { + num_ghost[i] = 1; + } + + switch (dim) + { + case 1: + nblocks = bx; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 2, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); + offsets[2][0] = 1; + } + break; + + case 2: + nblocks = bx*by; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + 
offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 5, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[3][0] = 1; + offsets[3][1] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); + offsets[4][0] = 0; + offsets[4][1] = 1; + } + break; + + case 3: + nblocks = bx*by*bz; + if(sym) + { + offsets = hypre_CTAlloc(HYPRE_Int*, 4, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + } + else + { + offsets = hypre_CTAlloc(HYPRE_Int*, 7, HYPRE_MEMORY_HOST); + offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[0][0] = -1; + offsets[0][1] = 0; + offsets[0][2] = 0; + offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[1][0] = 0; + offsets[1][1] = -1; + offsets[1][2] = 0; + offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[2][0] = 0; + offsets[2][1] = 0; + offsets[2][2] = -1; + offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, 
HYPRE_MEMORY_HOST); + offsets[3][0] = 0; + offsets[3][1] = 0; + offsets[3][2] = 0; + offsets[4] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[4][0] = 1; + offsets[4][1] = 0; + offsets[4][2] = 0; + offsets[5] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[5][0] = 0; + offsets[5][1] = 1; + offsets[5][2] = 0; + offsets[6] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); + offsets[6][0] = 0; + offsets[6][1] = 0; + offsets[6][2] = 1; + } + break; + } + + + + /* initialize */ + hypre_MPI_Init(&argc, &argv); + HYPRE_Init(); + + /* prepare space for the extents */ + ilower = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); + iupper = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); + for (i = 0; i < nblocks; i++) + { + ilower[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); + iupper[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); + } + + /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */ + ib = 0; + switch (dim) + { + case 1: + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ib++; + } + break; + case 2: + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ib++; + } + break; + case 3: + for (iz = 0; iz < bz; iz++) + for (iy = 0; iy < by; iy++) + for (ix = 0; ix < bx; ix++) + { + ilower[ib][0] = istart[0]+ nx*(bx*p+ix); + iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; + ilower[ib][1] = istart[1]+ ny*(by*q+iy); + iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; + ilower[ib][2] = istart[2]+ nz*(bz*r+iz); + iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1; + ib++; + } + break; + } + /* create grid */ + HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid); + for (ib = 0; ib < nblocks; ib++) + { + /* Add to the grid a new box defined by 
ilower[ib], iupper[ib]...*/ + HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]); + } + HYPRE_StructGridSetPeriodic(grid, periodic); + HYPRE_StructGridSetNumGhost(grid, num_ghost); + HYPRE_StructGridAssemble(grid); + + /* create struct vectors */ + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b); + HYPRE_StructVectorInitialize(b); + AddValuesVector(grid,b,periodic,1.0); + HYPRE_StructVectorAssemble(b); + + HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x); + HYPRE_StructVectorInitialize(x); + AddValuesVector(grid,x,periodic,1.0); + HYPRE_StructVectorAssemble(x); + + + /* call set const */ + my_hypre_StructVectorSetConstantValues(x, 1.0); + + /* call axpy */ + /* my_hypre_StructAxpy(1.0, x, b); */ + + + + + + + + hypre_printf("DONE\n"); + return 0; +} + +HYPRE_Int +AddValuesVector( hypre_StructGrid *gridvector, + hypre_StructVector *zvector, + HYPRE_Int *period, + HYPRE_Real value ) +{ +/* #include "_hypre_struct_mv.h" */ + HYPRE_Int ierr = 0; + hypre_BoxArray *gridboxes; + HYPRE_Int ib; + hypre_IndexRef ilower; + hypre_IndexRef iupper; + hypre_Box *box; + HYPRE_Real *values; + HYPRE_Int volume,dim; +#if 0 //defined(HYPRE_USING_CUDA) + HYPRE_Int data_location = hypre_StructGridDataLocation(hypre_StructVectorGrid(zvector)); +#endif + + gridboxes = hypre_StructGridBoxes(gridvector); + dim = hypre_StructGridNDim(gridvector); + + ib=0; + hypre_ForBoxI(ib, gridboxes) + { + box = hypre_BoxArrayBox(gridboxes, ib); + volume = hypre_BoxVolume(box); +#if 0 //defined(HYPRE_USING_CUDA) + if (data_location != HYPRE_MEMORY_HOST) + { + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); + } + else + { + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_HOST); + } +#else + values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); +#endif + /*----------------------------------------------------------- + * For periodic b.c. in all directions, need rhs to satisfy + * compatibility condition. 
Achieved by setting a source and + * sink of equal strength. All other problems have rhs = 1. + *-----------------------------------------------------------*/ + +#define DEVICE_VAR is_device_ptr(values) + if ((dim == 2 && period[0] != 0 && period[1] != 0) || + (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0)) + { + hypre_LoopBegin(volume,i) + { + values[i] = 0.0; + values[0] = value; + values[volume - 1] = -value; + + } + hypre_LoopEnd() + } + else + { + hypre_LoopBegin(volume,i) + { + values[i] = value; + } + hypre_LoopEnd() + } +#undef DEVICE_VAR + + ilower = hypre_BoxIMin(box); + iupper = hypre_BoxIMax(box); + + HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); + +#if 0 //defined(HYPRE_USING_CUDA) + if (data_location != HYPRE_MEMORY_HOST) + { + hypre_TFree(values,HYPRE_MEMORY_DEVICE); + } + else + { + hypre_TFree(values,HYPRE_MEMORY_HOST); + } +#else + hypre_TFree(values,HYPRE_MEMORY_DEVICE); +#endif + } + + return ierr; +} From 0c58ebe19bbbb0e256fb6cb969a45d1c111d8318 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 27 Sep 2021 16:00:54 -0700 Subject: [PATCH 08/44] Reproducing invalid kernel name error in simple --- src/struct_mv/_hypre_struct_mv.hpp | 175 ++++++++++++++++++++++- src/struct_mv/boxloop_sycl.h | 175 ++++++++++++++++++++++- src/test/simple.c | 221 +++++++---------------------- src/utilities/device_utils.c | 15 +- 4 files changed, 406 insertions(+), 180 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index beaed26fda..0cc8ba2619 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1179,6 +1179,169 @@ typedef struct hypre_Boxloop_struct } hypre_Boxloop; +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + 
HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. 
+ *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + 
HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ +/* { \ */ +/* hypre_newBoxLoopInit(ndim, loop_size); \ */ +/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ +/* hypre_printf("about to call BoxLoopfoall\n"); \ */ +/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ +/* { \ */ +/* hypre_newBoxLoopDeclare(databox1); \ */ +/* hypre_BoxLoopIncK(1, databox1, i1); */ + +/* #define hypre_newBoxLoop1End(i1) \ */ +/* }, hypre__tot); \ */ +/* } */ + + + + + + +/* BoxLoop 1 */ +/* without the extra function call to BoxLoopforall */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + }).wait_and_throw(); \ +} + + + + + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + @@ -1429,8 +1592,11 @@ typedef struct hypre_Boxloop_struct #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define 
hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End +/* WM: replacing boxloops one at a time starting with boxloop1 */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin @@ -1440,11 +1606,12 @@ typedef struct hypre_Boxloop_struct #define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin /* Reduction */ +/* WM: todo */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - hypre_BoxLoop1End(i1) + zypre_newBoxLoop1End(i1) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 0804d42fb7..f4dbd9eb63 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -27,6 +27,169 @@ typedef struct hypre_Boxloop_struct } hypre_Boxloop; +#ifdef __cplusplus +extern "C++" { +#endif + +/********************************************************************* + * forall function + *********************************************************************/ + +template +void +BoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length ) +{ + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == 
HYPRE_EXEC_HOST) + { +/* WM: todo - is this really necessary, even? */ +/* #ifdef HYPRE_USING_OPENMP */ +/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ +/* #endif */ +/* for (HYPRE_Int idx = 0; idx < length; idx++) */ +/* { */ +/* loop_body(idx); */ +/* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + +#ifdef __cplusplus +} +#endif + +/********************************************************************* + * Init/Declare/IncK etc. + *********************************************************************/ + +/* Get 1-D length of the loop, in hypre__tot */ +#define hypre_newBoxLoopInit(ndim, loop_size) \ + HYPRE_Int hypre__tot = 1; \ + for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ + { \ + hypre__tot *= loop_size[hypre_d]; \ + } + +/* Initialize struct for box-k */ +#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ + hypre_Boxloop databox##k; \ + databox##k.lsize0 = loop_size[0]; \ + databox##k.strides0 = stride[0]; \ + databox##k.bstart0 = start[0] - dbox->imin[0]; \ + databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ + if (ndim > 1) \ + { \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = start[1] - dbox->imin[1]; \ + databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ + } \ + else \ + { \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ + } \ + if (ndim == 3) \ + { \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = start[2] - dbox->imin[2]; \ + databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ + } \ + else \ + { \ + 
databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ + } + +/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ +/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ +#define hypre_newBoxLoopDeclare(box) \ + hypre_Index local_idx; \ + size_t idx_local = item.get_local_id(0); \ + hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ + idx_local = idx_local / box.lsize0; \ + hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ + idx_local = idx_local / box.lsize1; \ + hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ + +/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ +#define hypre_BoxLoopIncK(k, box, hypre__i) \ + HYPRE_Int hypre_boxD##k = 1; \ + HYPRE_Int hypre__i = 0; \ + hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ + hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ + hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); + + + +/* BoxLoop 1 */ +/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ +/* { \ */ +/* hypre_newBoxLoopInit(ndim, loop_size); \ */ +/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ +/* hypre_printf("about to call BoxLoopfoall\n"); \ */ +/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ +/* { \ */ +/* hypre_newBoxLoopDeclare(databox1); \ */ +/* hypre_BoxLoopIncK(1, databox1, i1); */ + +/* #define hypre_newBoxLoop1End(i1) \ */ +/* }, hypre__tot); \ */ +/* } */ + + + + + + +/* BoxLoop 1 */ +/* without the extra function call to BoxLoopforall */ +#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + 
hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + { \ + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +#define hypre_newBoxLoop1End(i1) \ + }); \ + }).wait_and_throw(); \ +} + + + + + +#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define my_hypre_BoxLoop1End hypre_newBoxLoop1End + @@ -277,8 +440,11 @@ typedef struct hypre_Boxloop_struct #define hypre_BoxLoopBlock zypre_BoxLoopBlock #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End +/* WM: replacing boxloops one at a time starting with boxloop1 */ +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin @@ -288,11 +454,12 @@ typedef struct hypre_Boxloop_struct #define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin /* Reduction */ +/* WM: todo */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - hypre_BoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - hypre_BoxLoop1End(i1) + zypre_newBoxLoop1End(i1) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, 
stride2, i2, reducesum) \ diff --git a/src/test/simple.c b/src/test/simple.c index 0649fef677..e8a953beed 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -13,166 +13,6 @@ HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, -/********************************************************************* - * put this in _hypre_utilities.hpp ? - * WM: todo - if you can wrap the basic parallel_for call for use elsewhere... - *********************************************************************/ -/* #define HYPRE_SYCL_1D_LAUNCH(kernel_name, gridsize, blocksize, ...) \ */ -/* { \ */ -/* if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ */ -/* { \ */ -/* hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ */ -/* __FILE__, __LINE__, \ */ -/* gridsize[0], blocksize[0]); \ */ -/* assert(0); exit(1); \ */ -/* } \ */ -/* else \ */ -/* { \ */ -/* hypre_printf("WM: debug - inside BoxLoopforall(), submitting to queue\n"); \ */ -/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ */ -/* { \ */ -/* cgh.parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), [=] (sycl::nd_item<1> item) \ */ -/* { (kernel_name)(item, __VA_ARGS__); } ); \ */ -/* }).wait_and_throw(); \ */ -/* } \ */ -/* } */ - - - -#ifdef __cplusplus -extern "C++" { -#endif - -/********************************************************************* - * forall function - *********************************************************************/ - -template -void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) -{ - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? 
*/ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); - } -} - -#ifdef __cplusplus -} -#endif - -/********************************************************************* - * Init/Declare/IncK etc. - *********************************************************************/ - -/* Get 1-D length of the loop, in hypre__tot */ -#define hypre_newBoxLoopInit(ndim, loop_size) \ - HYPRE_Int hypre__tot = 1; \ - for (HYPRE_Int hypre_d = 0; hypre_d < ndim; hypre_d ++) \ - { \ - hypre__tot *= loop_size[hypre_d]; \ - } - -/* Initialize struct for box-k */ -#define hypre_BoxLoopDataDeclareK(k, ndim, loop_size, dbox, start, stride) \ - hypre_Boxloop databox##k; \ - /* dim 0 */ \ - databox##k.lsize0 = loop_size[0]; \ - databox##k.strides0 = stride[0]; \ - databox##k.bstart0 = start[0] - dbox->imin[0]; \ - databox##k.bsize0 = dbox->imax[0] - dbox->imin[0]; \ - /* dim 1 */ \ - if (ndim > 1) \ - { \ - databox##k.lsize1 = loop_size[1]; \ - databox##k.strides1 = stride[1]; \ - databox##k.bstart1 = start[1] - dbox->imin[1]; \ - databox##k.bsize1 = dbox->imax[1] - dbox->imin[1]; \ - } \ - else \ - { \ - databox##k.lsize1 = 1; \ - databox##k.strides1 = 0; \ - databox##k.bstart1 = 0; \ - databox##k.bsize1 = 0; \ - } \ - /* dim 2 */ \ - if (ndim == 3) \ - { \ - databox##k.lsize2 = loop_size[2]; \ - databox##k.strides2 = stride[2]; \ - databox##k.bstart2 = start[2] - dbox->imin[2]; \ - databox##k.bsize2 = dbox->imax[2] - dbox->imin[2]; \ - } \ - else \ - { \ - databox##k.lsize2 = 1; \ 
- databox##k.strides2 = 0; \ - databox##k.bstart2 = 0; \ - databox##k.bsize2 = 0; \ - } - -/* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -#define hypre_newBoxLoopDeclare(box) \ - hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ - hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ - idx_local = idx_local / box.lsize0; \ - hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ - idx_local = idx_local / box.lsize1; \ - hypre_IndexD(local_idx, 2) = idx_local % box.lsize2; \ - -/* Given input 3-D 'local_idx', get 1-D 'hypre__i' in 'box' */ -#define hypre_BoxLoopIncK(k, box, hypre__i) \ - HYPRE_Int hypre_boxD##k = 1; \ - HYPRE_Int hypre__i = 0; \ - hypre__i += (hypre_IndexD(local_idx, 0) * box.strides0 + box.bstart0) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize0 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 1) * box.strides1 + box.bstart1) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize1 + 1); \ - hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * hypre_boxD##k; \ - hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); - - - -/* BoxLoop 1 */ -#define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); - -#define hypre_newBoxLoop1End(i1) \ - }, hypre__tot); \ -} - -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End HYPRE_Int my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, @@ -208,14 +48,13 @@ my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, hypre_BoxGetSize(box, loop_size); - // WM: question - What's DEVICE_VAR? 
#define DEVICE_VAR is_device_ptr(vp) - my_hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { vp[vi] = values; } - my_hypre_BoxLoop1End(vi); + hypre_BoxLoop1End(vi); #undef DEVICE_VAR } @@ -274,6 +113,29 @@ my_hypre_StructAxpy( HYPRE_Complex alpha, } +/**************************** + * show device function copied from oneAPI examples + ****************************/ +#include +#include "dpc_common.hpp" + +void ShowDevice(sycl::queue &q) { + using namespace std; + using namespace sycl; + // Output platform and device information. + auto device = q.get_device(); + auto p_name = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Name: " << p_name << "\n"; + auto p_version = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Version: " << p_version << "\n"; + auto d_name = device.get_info(); + cout << std::setw(20) << "Device Name: " << d_name << "\n"; + auto max_work_group = device.get_info(); + cout << std::setw(20) << "Max Work Group: " << max_work_group << "\n"; + auto max_compute_units = device.get_info(); + cout << std::setw(20) << "Max Compute Units: " << max_compute_units << "\n\n"; +} + /**************************** * main ****************************/ @@ -282,6 +144,27 @@ hypre_int main( hypre_int argc, char *argv[] ) { + + /* initialize */ + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + + + /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ + /* ShowDevice(my_queue); */ + + /* sycl::device gpu = sycl::device(sycl::cpu_selector{}); */ + /* sycl::device dev; */ + /* hypre_printf("is_host = %d\n", gpu.is_host()); */ + /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ + /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ + /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ + /* hypre_printf("DONE\n"); 
*/ + /* exit(0); */ + + + /* variables */ HYPRE_Int i, ix, iy, iz, ib; HYPRE_Int p, q, r; @@ -300,10 +183,10 @@ main( hypre_int argc, HYPRE_StructVector x; HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - dim = 1; + dim = 2; sym = 1; - nx = 1000; - ny = 1; + nx = 10; + ny = 10; nz = 1; bx = 1; by = 1; @@ -517,7 +400,11 @@ main( hypre_int argc, /* call set const */ - my_hypre_StructVectorSetConstantValues(x, 1.0); + my_hypre_StructVectorSetConstantValues(x, 5.0); + hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); + + hypre_StructVectorSetConstantValues(x, 5.0); + hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); /* call axpy */ /* my_hypre_StructAxpy(1.0, x, b); */ diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index f5dbdc07a1..78a136eeed 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -963,9 +963,11 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) } }; - sycl::device syclDev = data->device; - sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); - stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ + /* sycl::device syclDev = data->device; */ + /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ + /* stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ + stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); data->streams[i] = stream; } #endif @@ -1222,7 +1224,8 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); + /* WM: commenting out for now since I'm having trouble finding the device on frank */ + /* 
 hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); */
 #else
 hypre_DeviceDataDevice(data) = 0;
 #endif
@@ -1466,7 +1469,9 @@ hypre_bind_device( HYPRE_Int myid,
 hypre_MPI_Comm_free(&node_comm);
 
 /* get number of devices on this node */
- hypre_GetDeviceCount(&nDevices);
+ /* WM: doesn't work on frank... commenting out */
+ /* hypre_GetDeviceCount(&nDevices); */
+ nDevices = 1;
 
 /* set device */
 device_id = myNodeid % nDevices;
 
From 5695c978a44b04d649f2deb126dd6143d0542e58 Mon Sep 17 00:00:00 2001
From: Wayne Mitchell
Date: Wed, 29 Sep 2021 16:19:28 -0700
Subject: [PATCH 09/44] boxloop1 running on frank

I have fixed my compilation issues and can now run with my sycl boxloop1 implementation on frank's server machine. The boxloop1 code seems to be giving correct results as well, though it seems somewhere along the line I screwed up the struct solvers tests, which yield a discrepancy in number of iterations for the first solvers.jobs job.
---
 src/struct_mv/_hypre_struct_mv.hpp | 60 ++++++------------------
 src/struct_mv/boxloop_sycl.h | 52 ++++----------------
 src/test/Makefile | 3 +-
 src/test/simple.c | 19 ++++++----
 src/utilities/memory.c | 7 ++++
 5 files changed, 39 insertions(+), 102 deletions(-)

diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp
index 0cc8ba2619..d1866014f1 100644
--- a/src/struct_mv/_hypre_struct_mv.hpp
+++ b/src/struct_mv/_hypre_struct_mv.hpp
@@ -1209,6 +1209,7 @@ BoxLoopforall( LOOP_BODY loop_body,
 }
 else if (exec_policy == HYPRE_EXEC_DEVICE)
 {
+ /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda?
*/ const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); @@ -1275,7 +1276,7 @@ BoxLoopforall( LOOP_BODY loop_body, /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ + HYPRE_Int idx_local = (HYPRE_Int) idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1296,66 +1297,27 @@ BoxLoopforall( LOOP_BODY loop_body, /* BoxLoop 1 */ -/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ -/* { \ */ -/* hypre_newBoxLoopInit(ndim, loop_size); \ */ -/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ -/* hypre_printf("about to call BoxLoopfoall\n"); \ */ -/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ -/* { \ */ -/* hypre_newBoxLoopDeclare(databox1); \ */ -/* hypre_BoxLoopIncK(1, databox1, i1); */ - -/* #define hypre_newBoxLoop1End(i1) \ */ -/* }, hypre__tot); \ */ -/* } */ - - - - - - -/* BoxLoop 1 */ -/* without the extra function call to BoxLoopforall */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + size_t idx = item.get_global_linear_id(); \ + if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); 
#define hypre_newBoxLoop1End(i1) \ - }); \ - }).wait_and_throw(); \ + } \ + }, hypre__tot); \ } -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End - - - - - - - - - - - - - - - @@ -1593,10 +1555,10 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin #define hypre_BoxLoop0End zypre_newBoxLoop0End /* WM: replacing boxloops one at a time starting with boxloop1 */ -#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define hypre_BoxLoop1End hypre_newBoxLoop1End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ +/* #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin */ +/* #define hypre_BoxLoop1End hypre_newBoxLoop1End */ +#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin +#define hypre_BoxLoop1End zypre_newBoxLoop1End #define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin #define hypre_BoxLoop2End zypre_newBoxLoop2End #define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index f4dbd9eb63..e4ac919d90 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -57,6 +57,7 @@ BoxLoopforall( LOOP_BODY loop_body, } else if (exec_policy == HYPRE_EXEC_DEVICE) { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); @@ -123,7 +124,7 @@ BoxLoopforall( LOOP_BODY loop_body, /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - size_t idx_local = item.get_local_id(0); \ + HYPRE_Int idx_local = (HYPRE_Int) idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -144,66 +145,27 @@ BoxLoopforall( LOOP_BODY loop_body, /* BoxLoop 1 */ -/* #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ */ -/* { \ */ -/* hypre_newBoxLoopInit(ndim, loop_size); \ */ -/* hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ */ -/* hypre_printf("about to call BoxLoopfoall\n"); \ */ -/* BoxLoopforall( [=] (sycl::nd_item<1> item) \ */ -/* { \ */ -/* hypre_newBoxLoopDeclare(databox1); \ */ -/* hypre_BoxLoopIncK(1, databox1, i1); */ - -/* #define hypre_newBoxLoop1End(i1) \ */ -/* }, hypre__tot); \ */ -/* } */ - - - - - - -/* BoxLoop 1 */ -/* without the extra function call to BoxLoopforall */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); \ - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(hypre__tot, "thread", bDim); \ - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), [=] (sycl::nd_item<1> item) \ + size_t idx = item.get_global_linear_id(); \ + if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); #define 
hypre_newBoxLoop1End(i1) \ - }); \ - }).wait_and_throw(); \ + } \ + }, hypre__tot); \ } -#define my_hypre_BoxLoop1Begin hypre_newBoxLoop1Begin -#define my_hypre_BoxLoop1End hypre_newBoxLoop1End - - - - - - - - - - - - - - - diff --git a/src/test/Makefile b/src/test/Makefile index 5a4c606193..a05bb46feb 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -40,7 +40,8 @@ F77_COMPILE_FLAGS = \ MPILIBFLAGS = ${MPILIBDIRS} ${MPILIBS} ${MPIFLAGS} LAPACKLIBFLAGS = ${LAPACKLIBDIRS} ${LAPACKLIBS} BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS} -LIBFLAGS = ${LDFLAGS} ${LIBS} +# WM: had to add the absolute path to libHYPRE.a for successful compilation on frank +LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a ifeq ($(notdir $(firstword ${LINK_CC})), nvcc) XLINK = -Xlinker=-rpath,${HYPRE_BUILD_DIR}/lib diff --git a/src/test/simple.c b/src/test/simple.c index e8a953beed..a52260e1df 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -15,7 +15,7 @@ HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, HYPRE_Int -my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, +cpu_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, HYPRE_Complex values ) { hypre_Box *v_data_box; @@ -49,12 +49,12 @@ my_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, hypre_BoxGetSize(box, loop_size); #define DEVICE_VAR is_device_ptr(vp) - hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, + zypre_newBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { vp[vi] = values; } - hypre_BoxLoop1End(vi); + zypre_newBoxLoop1End(vi); #undef DEVICE_VAR } @@ -96,7 +96,6 @@ my_hypre_StructAxpy( HYPRE_Complex alpha, hypre_BoxGetSize(box, loop_size); -/* WM: what is the DEVICE_VAR thing? 
*/ #define DEVICE_VAR is_device_ptr(yp,xp) /* WM: todo */ /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ @@ -183,11 +182,11 @@ main( hypre_int argc, HYPRE_StructVector x; HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - dim = 2; + dim = 3; sym = 1; nx = 10; ny = 10; - nz = 1; + nz = 10; bx = 1; by = 1; bz = 1; @@ -398,14 +397,20 @@ main( hypre_int argc, AddValuesVector(grid,x,periodic,1.0); HYPRE_StructVectorAssemble(x); + hypre_StructVector *y = hypre_StructVectorClone(x); + hypre_StructVectorPrint("before", x, 1); /* call set const */ - my_hypre_StructVectorSetConstantValues(x, 5.0); + cpu_hypre_StructVectorSetConstantValues(y, 5.0); hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); + hypre_StructVectorPrint("after_cpu", y, 1); + hypre_StructVectorSetConstantValues(x, 5.0); hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); + hypre_StructVectorPrint("after_gpu", x, 1); + /* call axpy */ /* my_hypre_StructAxpy(1.0, x, b); */ diff --git a/src/utilities/memory.c b/src/utilities/memory.c index ece79f5d68..55df0f6aa3 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -109,6 +109,7 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) static inline void hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) { + /* hypre_printf("WM: debug - inside UnifiedMemPrefetch\n"); */ #if defined(HYPRE_USING_GPU) #ifdef HYPRE_DEBUG hypre_MemoryLocation tmp; @@ -244,6 +245,7 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) static inline void * hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) { + /* hypre_printf("WM: debug - inside UnifiedMalloc\n"); */ void *ptr = NULL; #if defined(HYPRE_USING_UMPIRE_UM) @@ -268,6 +270,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #if defined(HYPRE_USING_SYCL) HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); + /* hypre_printf("WM: debug - did the sycl shared 
allocation\n"); */ #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -275,6 +278,7 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) /* prefecth to device */ if (ptr) { + /* hypre_printf("WM: debug - about to prefetch\n"); */ hypre_UnifiedMemPrefetch(ptr, size, hypre_MEMORY_DEVICE); } @@ -969,6 +973,7 @@ hypre_GetExecPolicy2(HYPRE_MemoryLocation location1, HYPRE_Int hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { + /* hypre_printf("WM: debug - inside GetPointerLocation\n"); */ HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) @@ -1071,6 +1076,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) sycl::usm::alloc allocType; allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); + /* hypre_printf("WM: debug - checking allocType\n"); */ if (allocType == sycl::usm::alloc::unknown) { *memory_location = hypre_MEMORY_HOST; @@ -1086,6 +1092,7 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) else if (allocType == sycl::usm::alloc::shared) { *memory_location = hypre_MEMORY_UNIFIED; + /* hypre_printf("WM: debug - IS UNIFIED MEMORY\n"); */ } #endif //HYPRE_USING_SYCL From f4d9ba405b09ee9c4ce6d1af56fec7c57873e0bc Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 29 Sep 2021 16:56:32 -0700 Subject: [PATCH 10/44] Resolve further merge conflicts, passes struct tests --- src/utilities/_hypre_utilities.h | 2 +- src/utilities/_hypre_utilities.hpp | 2 +- src/utilities/device_utils.h | 2 +- src/utilities/handle.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index f66d302115..7faf7f9a1d 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1285,7 +1285,7 @@ typedef struct #define hypre_HandleStructCommRecvBufferSize(hypre_handle) 
hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmAlgorithm(hypre_handle) hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index bc609bdc3d..61e8ae0998 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -320,7 +320,7 @@ struct hypre_DeviceData #define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) #define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) #define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_algorithm) #define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) #define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) #define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> 
spgemm_rownnz_estimate_mult_factor) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 41c6c41659..3483361926 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -268,7 +268,7 @@ struct hypre_DeviceData #define hypre_DeviceDataStructCommRecvBufferSize(data) ((data) -> struct_comm_recv_buffer_size) #define hypre_DeviceDataStructCommSendBufferSize(data) ((data) -> struct_comm_send_buffer_size) #define hypre_DeviceDataSpgemmUseCusparse(data) ((data) -> spgemm_use_cusparse) -#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_num_passes) +#define hypre_DeviceDataSpgemmAlgorithm(data) ((data) -> spgemm_algorithm) #define hypre_DeviceDataSpgemmRownnzEstimateMethod(data) ((data) -> spgemm_rownnz_estimate_method) #define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data) ((data) -> spgemm_rownnz_estimate_nsamples) #define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor) diff --git a/src/utilities/handle.h b/src/utilities/handle.h index c49fa5fdf3..8e5979c7a2 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -74,7 +74,7 @@ typedef struct #define hypre_HandleStructCommRecvBufferSize(hypre_handle) hypre_DeviceDataStructCommRecvBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommSendBufferSize(hypre_handle) hypre_DeviceDataStructCommSendBufferSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmUseCusparse(hypre_handle) hypre_DeviceDataSpgemmUseCusparse(hypre_HandleDeviceData(hypre_handle)) -#define hypre_HandleSpgemmNumPasses(hypre_handle) hypre_DeviceDataSpgemmNumPasses(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleSpgemmAlgorithm(hypre_handle) hypre_DeviceDataSpgemmAlgorithm(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMethod(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMethod(hypre_HandleDeviceData(hypre_handle)) #define 
hypre_HandleSpgemmRownnzEstimateNsamples(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateNsamples(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleSpgemmRownnzEstimateMultFactor(hypre_handle) hypre_DeviceDataSpgemmRownnzEstimateMultFactor(hypre_HandleDeviceData(hypre_handle)) From 845a433bd44bc2fca5c493f625f6098eaf63555c Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 30 Sep 2021 10:53:29 -0700 Subject: [PATCH 11/44] Non-reduction boxloops done The non-reduction boxloops are all in and pass the struct tests. Performance is VERY slow, but this may just be due to the machine I am running on. Reduction boxloops are in progress. --- src/struct_ls/_hypre_struct_ls.h | 2 +- src/struct_ls/protos.h | 5 + src/struct_mv/_hypre_struct_mv.hpp | 310 +++++++++++++++++++++++++++-- src/struct_mv/boxloop_sycl.h | 306 ++++++++++++++++++++++++++-- 4 files changed, 580 insertions(+), 43 deletions(-) diff --git a/src/struct_ls/_hypre_struct_ls.h b/src/struct_ls/_hypre_struct_ls.h index f8a753e6ee..4078385df0 100644 --- a/src/struct_ls/_hypre_struct_ls.h +++ b/src/struct_ls/_hypre_struct_ls.h @@ -450,9 +450,9 @@ HYPRE_Int hypre_SparseMSGSetupRAPOp ( hypre_StructMatrix *R , hypre_StructMatrix /* sparse_msg_solve.c */ HYPRE_Int hypre_SparseMSGSolve ( void *smsg_vdata , hypre_StructMatrix *A , hypre_StructVector *b , hypre_StructVector *x ); - #ifdef __cplusplus } #endif #endif + diff --git a/src/struct_ls/protos.h b/src/struct_ls/protos.h index 67540ac062..a7187c4016 100644 --- a/src/struct_ls/protos.h +++ b/src/struct_ls/protos.h @@ -5,6 +5,11 @@ * SPDX-License-Identifier: (Apache-2.0 OR MIT) ******************************************************************************/ +/* coarsen.c */ +HYPRE_Int hypre_StructMapFineToCoarse ( hypre_Index findex , hypre_Index index , hypre_Index stride , hypre_Index cindex ); +HYPRE_Int hypre_StructMapCoarseToFine ( hypre_Index cindex , hypre_Index index , hypre_Index stride , hypre_Index findex ); +HYPRE_Int hypre_StructCoarsen ( 
hypre_StructGrid *fgrid , hypre_Index index , hypre_Index stride , HYPRE_Int prune , hypre_StructGrid **cgrid_ptr ); + /* cyclic_reduction.c */ void *hypre_CyclicReductionCreate ( MPI_Comm comm ); hypre_StructMatrix *hypre_CycRedCreateCoarseOp ( hypre_StructMatrix *A , hypre_StructGrid *coarse_grid , HYPRE_Int cdir ); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d1866014f1..de88e1b0cd 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1184,7 +1184,7 @@ extern "C++" { #endif /********************************************************************* - * forall function + * wrapper functions calling sycl parallel_for *********************************************************************/ template @@ -1220,6 +1220,59 @@ BoxLoopforall( LOOP_BODY loop_body, } } +template +void +ReductionBoxLoopforall( HYPRE_Int length, + REDUCER &reducer, + LOOP_BODY loop_body ) +{ + if (length <= 0) + { + return; + } + + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { + /* WM: todo - is this really necessary, even? */ + /* for (HYPRE_Int idx = 0; idx < length; idx++) */ + /* { */ + /* loop_body(idx, reducer); */ + /* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ + /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + reducer.nblocks = gDim.size(); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + + + + + + + + + + + + + + + #ifdef __cplusplus } #endif @@ -1272,11 +1325,46 @@ BoxLoopforall( LOOP_BODY loop_body, databox##k.bsize2 = 0; \ } +#define hypre_BasicBoxLoopDataDeclareK(k,ndim,loop_size,stride) \ +hypre_Boxloop databox##k; \ +databox##k.lsize0 = loop_size[0]; \ +databox##k.strides0 = stride[0]; \ +databox##k.bstart0 = 0; \ +databox##k.bsize0 = 0; \ +if (ndim > 1) \ +{ \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +else \ +{ \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +if (ndim == 3) \ +{ \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} \ +else \ +{ \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} + /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = (HYPRE_Int) idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1294,8 +1382,30 @@ BoxLoopforall( LOOP_BODY loop_body, hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * 
hypre_boxD##k; \ hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); +/* get 3-D local_idx into 'index' */ +#define hypre_BoxLoopGetIndex(index) \ + index[0] = hypre_IndexD(local_idx, 0); \ + index[1] = hypre_IndexD(local_idx, 1); \ + index[2] = hypre_IndexD(local_idx, 2); + + + +/********************************************************************* + * Boxloops + *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ + { + +#define hypre_newBoxLoop0End() \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1303,7 +1413,7 @@ BoxLoopforall( LOOP_BODY loop_body, hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - size_t idx = item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1314,6 +1424,147 @@ BoxLoopforall( LOOP_BODY loop_body, }, hypre__tot); \ } +/* BoxLoop 2 */ +#define hypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define hypre_newBoxLoop2End(i1, i2) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 3 */ +#define hypre_newBoxLoop3Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, 
stride3, i3) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); + +#define hypre_newBoxLoop3End(i1, i2, i3) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 4 */ +#define hypre_newBoxLoop4Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ + hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); \ + hypre_BoxLoopIncK(4, databox4, i4); + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + } \ + }, hypre__tot); \ +} + + +/* Basic BoxLoops have no boxes */ +/* BoxLoop 1 */ +#define hypre_newBasicBoxLoop1Begin(ndim, loop_size, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < 
hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +/* BoxLoop 2 */ +#define hypre_newBasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + + +/* Reduction BoxLoop1 */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1);\ + const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ + const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ + for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ + reducer.BlockReduce();\ + } \ + }, hypre__tot); \ +} + +/* Reduction BoxLoop2 */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ + }); \ +} + + @@ -1549,26 +1800,41 @@ BoxLoopforall( LOOP_BODY loop_body, } \ } -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin -#define hypre_BoxLoop0End zypre_newBoxLoop0End -/* WM: replacing boxloops one at a time starting with boxloop1 */ -/* #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End hypre_newBoxLoop1End */ -#define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin -#define hypre_BoxLoop1End zypre_newBoxLoop1End -#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin -#define hypre_BoxLoop2End zypre_newBoxLoop2End -#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin -#define hypre_BoxLoop3End zypre_newBoxLoop3End -#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin -#define hypre_BoxLoop4End zypre_newBoxLoop4End -#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin + + + + + + + + + + + + +/********************************************************************* + * renamings + *********************************************************************/ + +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End +#define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin +#define hypre_BoxLoop1End hypre_newBoxLoop1End +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +#define hypre_BasicBoxLoop1Begin hypre_newBasicBoxLoop1Begin +#define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo */ +/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ 
zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) @@ -1577,11 +1843,11 @@ BoxLoopforall( LOOP_BODY loop_body, #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - hypre_BoxLoop2End(i1, i2) + zypre_newBoxLoop2End(i1, i2) #endif diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index e4ac919d90..b2cb757231 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -32,7 +32,7 @@ extern "C++" { #endif /********************************************************************* - * forall function + * wrapper functions calling sycl parallel_for *********************************************************************/ template @@ -68,6 +68,59 @@ BoxLoopforall( LOOP_BODY loop_body, } } +template +void +ReductionBoxLoopforall( HYPRE_Int length, + REDUCER &reducer, + LOOP_BODY loop_body ) +{ + if (length <= 0) + { + return; + } + + /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ + /* WM: TODO: uncomment above and remove below */ + HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; + + if (exec_policy == HYPRE_EXEC_HOST) + { + /* WM: todo - is this really necessary, even? */ + /* for (HYPRE_Int idx = 0; idx < length; idx++) */ + /* { */ + /* loop_body(idx, reducer); */ + /* } */ + } + else if (exec_policy == HYPRE_EXEC_DEVICE) + { + /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ + /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + + reducer.nblocks = gDim.size(); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); + } +} + + + + + + + + + + + + + + + #ifdef __cplusplus } #endif @@ -120,11 +173,46 @@ BoxLoopforall( LOOP_BODY loop_body, databox##k.bsize2 = 0; \ } +#define hypre_BasicBoxLoopDataDeclareK(k,ndim,loop_size,stride) \ +hypre_Boxloop databox##k; \ +databox##k.lsize0 = loop_size[0]; \ +databox##k.strides0 = stride[0]; \ +databox##k.bstart0 = 0; \ +databox##k.bsize0 = 0; \ +if (ndim > 1) \ +{ \ + databox##k.lsize1 = loop_size[1]; \ + databox##k.strides1 = stride[1]; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +else \ +{ \ + databox##k.lsize1 = 1; \ + databox##k.strides1 = 0; \ + databox##k.bstart1 = 0; \ + databox##k.bsize1 = 0; \ +} \ +if (ndim == 3) \ +{ \ + databox##k.lsize2 = loop_size[2]; \ + databox##k.strides2 = stride[2]; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} \ +else \ +{ \ + databox##k.lsize2 = 1; \ + databox##k.strides2 = 0; \ + databox##k.bstart2 = 0; \ + databox##k.bsize2 = 0; \ +} + /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = (HYPRE_Int) idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -142,7 +230,29 @@ BoxLoopforall( LOOP_BODY loop_body, hypre__i += (hypre_IndexD(local_idx, 2) * box.strides2 + box.bstart2) * 
hypre_boxD##k; \ hypre_boxD##k *= hypre_max(0, box.bsize2 + 1); +/* get 3-D local_idx into 'index' */ +#define hypre_BoxLoopGetIndex(index) \ + index[0] = hypre_IndexD(local_idx, 0); \ + index[1] = hypre_IndexD(local_idx, 1); \ + index[2] = hypre_IndexD(local_idx, 2); + + + +/********************************************************************* + * Boxloops + *********************************************************************/ + + +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ + { +#define hypre_newBoxLoop0End() \ + }); \ +} /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ @@ -151,7 +261,7 @@ BoxLoopforall( LOOP_BODY loop_body, hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - size_t idx = item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -162,6 +272,147 @@ BoxLoopforall( LOOP_BODY loop_body, }, hypre__tot); \ } +/* BoxLoop 2 */ +#define hypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define hypre_newBoxLoop2End(i1, i2) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 3 */ +#define hypre_newBoxLoop3Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, 
i3) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); + +#define hypre_newBoxLoop3End(i1, i2, i3) \ + } \ + }, hypre__tot); \ +} + +/* BoxLoop 4 */ +#define hypre_newBoxLoop4Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, \ + dbox3, start3, stride3, i3, \ + dbox4, start4, stride4, i4) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ + hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); \ + hypre_BoxLoopIncK(3, databox3, i3); \ + hypre_BoxLoopIncK(4, databox4, i4); + +#define hypre_newBoxLoop4End(i1, i2, i3, i4) \ + } \ + }, hypre__tot); \ +} + + +/* Basic BoxLoops have no boxes */ +/* BoxLoop 1 */ +#define hypre_newBasicBoxLoop1Begin(ndim, loop_size, stride1, i1) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + 
{ \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); + +/* BoxLoop 2 */ +#define hypre_newBasicBoxLoop2Begin(ndim, loop_size, stride1, i1, stride2, i2) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ + hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ + BoxLoopforall( [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + + +/* Reduction BoxLoop1 */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1);\ + const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ + const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ + for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ + { + +#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ + reducer.BlockReduce();\ + } \ + }, hypre__tot); \ +} + +/* Reduction BoxLoop2 */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); + +#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ + }); \ +} + + @@ -397,26 +648,41 @@ BoxLoopforall( LOOP_BODY loop_body, } \ } -#define hypre_BoxLoopGetIndex zypre_BoxLoopGetIndex -#define hypre_BoxLoopBlock zypre_BoxLoopBlock -#define hypre_BoxLoop0Begin zypre_newBoxLoop0Begin -#define hypre_BoxLoop0End zypre_newBoxLoop0End -/* WM: replacing boxloops one at a time starting with boxloop1 */ + + + + + + + + + + + + +/********************************************************************* + * renamings + *********************************************************************/ + +#define hypre_BoxLoopBlock() 0 + +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End -/* #define hypre_BoxLoop1Begin zypre_newBoxLoop1Begin */ -/* #define hypre_BoxLoop1End zypre_newBoxLoop1End */ -#define hypre_BoxLoop2Begin zypre_newBoxLoop2Begin -#define hypre_BoxLoop2End zypre_newBoxLoop2End -#define hypre_BoxLoop3Begin zypre_newBoxLoop3Begin -#define hypre_BoxLoop3End zypre_newBoxLoop3End -#define hypre_BoxLoop4Begin zypre_newBoxLoop4Begin -#define hypre_BoxLoop4End zypre_newBoxLoop4End -#define hypre_BasicBoxLoop2Begin zypre_newBasicBoxLoop2Begin +#define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin +#define hypre_BoxLoop2End hypre_newBoxLoop2End +#define hypre_BoxLoop3Begin hypre_newBoxLoop3Begin +#define hypre_BoxLoop3End hypre_newBoxLoop3End +#define hypre_BoxLoop4Begin hypre_newBoxLoop4Begin +#define hypre_BoxLoop4End hypre_newBoxLoop4End + +#define hypre_BasicBoxLoop1Begin hypre_newBasicBoxLoop1Begin +#define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo */ +/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) @@ -425,10 +691,10 @@ BoxLoopforall( LOOP_BODY 
loop_body, #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - hypre_BoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ + zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - hypre_BoxLoop2End(i1, i2) + zypre_newBoxLoop2End(i1, i2) #endif From 4ed00c48458dfeca2f5d8738c56a3908fc878139 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 30 Sep 2021 17:34:24 -0700 Subject: [PATCH 12/44] First attempt at reduction boxloops, seg faulting right now --- src/struct_ls/pfmg_setup.c | 43 +++++++++++++----- src/struct_mv/_hypre_struct_mv.hpp | 71 ++++++++--------------------- src/struct_mv/boxloop_sycl.h | 73 +++++++++--------------------- src/utilities/device_utils.h | 6 +++ 4 files changed, 79 insertions(+), 114 deletions(-) diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index b3db006d6e..fbf91d16d3 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -809,18 +809,23 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A, switch (stencil_size) { case 5: + hypre_printf("WM: debug - stencil size = 5\n"); hypre_PFMGComputeDxyz_SS5 (i, A, cxyz, sqcxyz); break; case 9: + hypre_printf("WM: debug - stencil size = 9\n"); hypre_PFMGComputeDxyz_SS9 (i, A, cxyz, sqcxyz); break; case 7: + hypre_printf("WM: debug - stencil size = 7\n"); hypre_PFMGComputeDxyz_SS7 (i, A, cxyz, sqcxyz); break; case 19: + hypre_printf("WM: debug - stencil size = 19\n"); hypre_PFMGComputeDxyz_SS19(i, A, cxyz, sqcxyz); break; case 27: + hypre_printf("WM: debug - stencil size = 27\n"); hypre_PFMGComputeDxyz_SS27(i, A, cxyz, sqcxyz); break; default: @@ -1051,50 +1056,66 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, hypre_SetIndex3(index, 0, 1, 0); a_cn = hypre_StructMatrixExtractPointerByIndex(A, bi, index); - // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) + // FIXME TODO HOW TO DO KOKKOS (WM: and SYCL) IN ONE BOXLOOP ? +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, cxb) + hypre_newBoxLoop1ReductionEnd(Ai, cxb) HYPRE_Real cyb = cxyz[1]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, cyb) + hypre_newBoxLoop1ReductionEnd(Ai, cyb) HYPRE_Real sqcxb = sqcxyz[0]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx * tcx; +#else sqcxb += tcx * tcx; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, sqcxb) + hypre_newBoxLoop1ReductionEnd(Ai, sqcxb) HYPRE_Real sqcyb = sqcxyz[1]; - hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy * tcy; +#else sqcyb += tcy * tcy; +#endif } - hypre_BoxLoop1ReductionEnd(Ai, sqcyb) + hypre_newBoxLoop1ReductionEnd(Ai, sqcyb) -#else /* kokkos */ +#else // #if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_RAJA) ReduceSum cxb(cxyz[0]),cyb(cxyz[1]),sqcxb(sqcxyz[0]),sqcyb(sqcxyz[1]); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index de88e1b0cd..13982b41cb 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1220,11 +1220,11 @@ BoxLoopforall( LOOP_BODY loop_body, } } -template +template void -ReductionBoxLoopforall( HYPRE_Int length, - REDUCER &reducer, - LOOP_BODY loop_body ) +ReductionBoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length, + sycl::buffer sum_buf ) { if (length <= 0) { @@ -1250,11 +1250,10 @@ ReductionBoxLoopforall( HYPRE_Int length, const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - reducer.nblocks = gDim.size(); - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); }).wait_and_throw(); } } @@ -1394,18 +1393,6 @@ else \ * Boxloops *********************************************************************/ - -/* BoxLoop 0 */ -#define hypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ - { - -#define hypre_newBoxLoop0End() \ - }); \ -} - /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1528,40 +1515,24 @@ else \ /* 
Reduction BoxLoop1 */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1);\ - const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ - const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ - for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ - { + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ - reducer.BlockReduce();\ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot); \ -} - -/* Reduction BoxLoop2 */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, reducesum) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, 
i1); \ - hypre_BoxLoopIncK(2, databox2, i2); - -#define hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ - }); \ + }, hypre__tot, sum_buf); \ } @@ -1819,8 +1790,6 @@ else \ #define hypre_BoxLoopBlock() 0 -#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin -#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index b2cb757231..0a4fae81f0 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -68,11 +68,11 @@ BoxLoopforall( LOOP_BODY loop_body, } } -template +template void -ReductionBoxLoopforall( HYPRE_Int length, - REDUCER &reducer, - LOOP_BODY loop_body ) +ReductionBoxLoopforall( LOOP_BODY loop_body, + HYPRE_Int length, + sycl::buffer sum_buf ) { if (length <= 0) { @@ -98,11 +98,10 @@ ReductionBoxLoopforall( HYPRE_Int length, const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - reducer.nblocks = gDim.size(); - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); }).wait_and_throw(); } } @@ -242,18 +241,6 @@ else \ * Boxloops *********************************************************************/ - -/* BoxLoop 0 */ -#define hypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - BoxLoopforall(hypre__tot, HYPRE_LAMBDA (HYPRE_Int idx) \ - { - -#define hypre_newBoxLoop0End() \ - }); \ -} - /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -376,40 +363,24 @@ else \ /* Reduction 
BoxLoop1 */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item, decltype(reducesum) &reducesum) \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1);\ - const HYPRE_Int thread_id = (HYPRE_Int) item.get_global_linear_id();\ - const HYPRE_Int n_threads = (HYPRE_Int) item.get_global_range().size();\ - for (HYPRE_Int idx = thread_id; idx < length; idx += n_threads)\ - { - -#define hypre_newBoxLoop1ReductionEnd(i1, reducesum) \ - reducer.BlockReduce();\ - } \ - }, hypre__tot); \ -} + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); -/* Reduction BoxLoop2 */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, reducesum) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - ReductionBoxLoopforall(hypre__tot, reducesum, HYPRE_LAMBDA (HYPRE_Int idx, decltype(reducesum) &reducesum) \ - { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); \ - hypre_BoxLoopIncK(2, databox2, i2); - -#define 
hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) \ - }); \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ } @@ -667,8 +638,6 @@ else \ #define hypre_BoxLoopBlock() 0 -#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin -#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 3483361926..9d95075c3e 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -54,6 +54,7 @@ #elif defined(HYPRE_USING_SYCL) #include +/* WM: todo - include below as necessary */ /* #include */ /* #include */ /* #include */ @@ -226,6 +227,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -346,6 +348,8 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? */ +/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) /* return the number of work-items in current work-group */ template @@ -506,6 +510,8 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#elif defined(HYPRE_USING_SYCL) +/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC From 2fb3f27fd3b8766a1bf077d32f4587b3abe84c03 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 4 Oct 2021 17:04:05 -0700 Subject: [PATCH 13/44] Reproducing seg fault when trying to launch trivial reduction parallel_for --- src/test/Makefile | 2 +- src/test/simple.c | 49 +++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/src/test/Makefile b/src/test/Makefile index f7f5d5431d..b5910211c6 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -140,7 +140,7 @@ ij: ij.o ${LINK_CC} -o $@ $< ${LFLAGS} # WM: TODO: remove -simple: simple.o +simple: simple.obj @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/simple.c b/src/test/simple.c index a52260e1df..e385aefe69 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -1,7 +1,5 @@ /* WM: todo - remove this file from git */ -#include "_hypre_utilities.h" -#include "_hypre_utilities.hpp" #include "HYPRE.h" #include "_hypre_struct_mv.h" #include "_hypre_struct_mv.hpp" @@ -145,10 +143,49 @@ main( hypre_int argc, { /* initialize */ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ + hypre_MPI_Init(&argc, &argv); + HYPRE_Init(); /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + HYPRE_Int length = 1000; + const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); + HYPRE_Real sum_var = 0; + sycl::buffer sum_buf(&sum_var, 1); + + /* Reduction parallel_for with accessor */ + std::cout << "Launching parallel_for reduction with accessor" << std::endl; + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, 
bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), + [=] (sycl::nd_item<1> item, auto &sum) + { + /* trivial kernel */ + }); + }).wait_and_throw(); + + + + + HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); + + /* Reduction parallel_for with unified memory pointer */ + std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), + [=] (sycl::nd_item<1> item, auto &sum) + { + /* trivial kernel */ + }); + }).wait_and_throw(); + + + + /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ /* ShowDevice(my_queue); */ @@ -159,8 +196,8 @@ main( hypre_int argc, /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - /* hypre_printf("DONE\n"); */ - /* exit(0); */ + hypre_printf("DONE\n"); + exit(0); From 001fb9f5d15258034923aba09a909370e25118c4 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 16:40:50 -0700 Subject: [PATCH 14/44] Reduction boxloops done The reduction boxloops are implemented and pass the struct solvers.sh tests. Cleanup of boxloop_sycl.h. 
--- src/struct_ls/HYPRE_struct_int.c | 4 +- src/struct_ls/pfmg_setup.c | 127 ++++++++++-- src/struct_mv/_hypre_struct_mv.hpp | 33 ++- src/struct_mv/boxloop_sycl.h | 320 ++++------------------------- src/struct_mv/struct_innerprod.c | 6 +- src/test/simple.c | 156 +++++++++++--- src/utilities/_hypre_utilities.hpp | 14 +- src/utilities/device_utils.c | 13 +- src/utilities/device_utils.h | 3 +- src/utilities/headers | 5 + 10 files changed, 332 insertions(+), 349 deletions(-) diff --git a/src/struct_ls/HYPRE_struct_int.c b/src/struct_ls/HYPRE_struct_int.c index abb1869fcd..e9048acbf7 100644 --- a/src/struct_ls/HYPRE_struct_int.c +++ b/src/struct_ls/HYPRE_struct_int.c @@ -71,9 +71,7 @@ hypre_StructVectorSetRandomValues( hypre_StructVector *vector, hypre_BoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, v_data_box, start, unit_stride, vi); { -// WM: TODO: temporary fix... remove after sycl implementation is done -#if defined(HYPRE_USING_SYCL) -#elif defined(HYPRE_USING_GPU) +#if defined(HYPRE_USING_GPU) vp[vi] = rand_device[idx]; #else vp[vi] = 2.0*hypre_Rand() - 1.0; diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index fbf91d16d3..684824f26a 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -809,23 +809,18 @@ hypre_PFMGComputeDxyz( hypre_StructMatrix *A, switch (stencil_size) { case 5: - hypre_printf("WM: debug - stencil size = 5\n"); hypre_PFMGComputeDxyz_SS5 (i, A, cxyz, sqcxyz); break; case 9: - hypre_printf("WM: debug - stencil size = 9\n"); hypre_PFMGComputeDxyz_SS9 (i, A, cxyz, sqcxyz); break; case 7: - hypre_printf("WM: debug - stencil size = 7\n"); hypre_PFMGComputeDxyz_SS7 (i, A, cxyz, sqcxyz); break; case 19: - hypre_printf("WM: debug - stencil size = 19\n"); hypre_PFMGComputeDxyz_SS19(i, A, cxyz, sqcxyz); break; case 27: - hypre_printf("WM: debug - stencil size = 27\n"); hypre_PFMGComputeDxyz_SS27(i, A, cxyz, sqcxyz); break; default: @@ -1060,7 +1055,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, #if 
defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1071,10 +1066,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, cxb += tcx; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, cxb) + hypre_BoxLoop1ReductionEnd(Ai, cxb) HYPRE_Real cyb = cxyz[1]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, cyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1085,10 +1080,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, cyb += tcy; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, cyb) + hypre_BoxLoop1ReductionEnd(Ai, cyb) HYPRE_Real sqcxb = sqcxyz[0]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcxb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1099,10 +1094,10 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, sqcxb += tcx * tcx; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, sqcxb) + hypre_BoxLoop1ReductionEnd(Ai, sqcxb) HYPRE_Real sqcyb = sqcxyz[1]; - hypre_newBoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, + hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, A_dbox, start, stride, Ai, sqcyb) { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; @@ -1113,7 +1108,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, sqcyb += tcy * tcy; #endif } - hypre_newBoxLoop1ReductionEnd(Ai, sqcyb) + hypre_BoxLoop1ReductionEnd(Ai, sqcyb) #else // #if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) @@ -1262,7 +1257,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, a_cne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1270,7 +1265,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1280,7 +1279,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1290,7 +1293,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1300,7 +1307,11 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1437,7 +1448,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, a_bc = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1445,7 +1456,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1455,7 +1470,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1465,7 +1484,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1475,7 +1498,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1485,7 +1512,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1495,7 +1526,11 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -1692,7 +1727,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, a_cne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? 
-#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1700,7 +1735,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -1710,7 +1749,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1720,7 +1763,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1730,7 +1777,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -1740,7 +1791,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb) @@ -1750,7 +1805,11 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, { HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -1988,7 +2047,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, a_bne = hypre_StructMatrixExtractPointerByIndex(A, bi, index); // FIXME TODO HOW TO DO KOKKOS IN ONE BOXLOOP ? -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real cxb = cxyz[0]; hypre_BoxLoop1ReductionBegin(hypre_StructMatrixNDim(A), loop_size, @@ -1998,7 +2057,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx; +#else cxb += tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -2010,7 +2073,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy; +#else cyb += tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -2022,7 +2089,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz; +#else czb += tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -2034,7 +2105,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcx*tcx; +#else sqcxb += tcx*tcx; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcxb) @@ -2046,7 +2121,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcy*tcy; +#else sqcyb += tcy*tcy; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqcyb); @@ -2058,7 +2137,11 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); +#if defined(HYPRE_USING_SYCL) + sum += tcz*tcz; +#else sqczb += tcz*tcz; +#endif } hypre_BoxLoop1ReductionEnd(Ai, sqczb) @@ -2198,7 +2281,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) } else { -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real diag_product_local = diag_product; #elif defined(HYPRE_USING_RAJA) ReduceSum diag_product_local(diag_product); @@ -2226,11 +2309,19 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) HYPRE_Real zero = 0.0; if (Ap[Ai] == 0.0) { +#if defined(HYPRE_USING_SYCL) + sum += one; +#else diag_product_local += one; +#endif } else { +#if defined(HYPRE_USING_SYCL) + sum += zero; +#else diag_product_local += zero; +#endif } } hypre_BoxLoop1ReductionEnd(Ai, diag_product_local); diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 13982b41cb..86aded1b93 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1535,8 +1535,29 @@ else \ }, hypre__tot, sum_buf); \ } +/* Reduction BoxLoop2 */ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); - +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ +} @@ -1805,18 +1826,18 @@ else \ /* Reduction */ /* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - zypre_newBoxLoop1End(i1) + hypre_newBoxLoop1ReductionEnd(i1, reducesum) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) + hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - zypre_newBoxLoop2End(i1, i2) + hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) #endif diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 0a4fae81f0..1c44ee3e08 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -106,24 +106,11 @@ ReductionBoxLoopforall( LOOP_BODY 
loop_body, } } - - - - - - - - - - - - - - #ifdef __cplusplus } #endif + /********************************************************************* * Init/Declare/IncK etc. *********************************************************************/ @@ -211,7 +198,7 @@ else \ /* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -236,7 +223,6 @@ else \ index[2] = hypre_IndexD(local_idx, 2); - /********************************************************************* * Boxloops *********************************************************************/ @@ -248,7 +234,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -268,7 +254,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -291,7 +277,7 @@ else \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -317,7 +303,7 @@ else \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ BoxLoopforall( [=] (sycl::nd_item<1> item) 
\ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -340,7 +326,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -354,7 +340,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -365,273 +351,49 @@ else \ /* Reduction BoxLoop1 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ -{ \ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ +{ \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, sum_buf); \ } +/* Reduction BoxLoop2 */ +/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ +/* Right now, it is hardcoded as a HYPRE_Real */ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_BoxLoopIncK(2, databox2, i2); - - - - - - - - -/********************************************************************* - * HOST IMPLEMENTATION - *********************************************************************/ - -#ifdef HYPRE_USING_OPENMP -#define HYPRE_BOX_REDUCTION -#if defined(WIN32) && defined(_MSC_VER) -#define Pragma(x) __pragma(HYPRE_XSTR(x)) -#else -#define Pragma(x) _Pragma(HYPRE_XSTR(x)) -#endif -#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#else /* #ifdef HYPRE_USING_OPENMP */ -#define OMP0 -#define OMP1 -#endif /* #ifdef HYPRE_USING_OPENMP */ - -#define zypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopInit(ndim, loop_size); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - zypre_BoxLoopSet(); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop0End() \ - } \ - zypre_BoxLoopInc1(); \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop1Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1) \ -{ \ - HYPRE_Int i1; \ - zypre_BoxLoopDeclare(); \ - 
zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop1End(i1) \ - i1 += hypre__i0inc1; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop2Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) \ -{ \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop2End(i1, i2) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop3Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3) \ -{ \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, 
start3, stride3, i3); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop3End(i1, i2, i3) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop4Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3, \ - dbox4, start4, stride4, i4) \ -{ \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopDeclareK(4); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - zypre_BoxLoopInitK(4, dbox4, start4, stride4, i4); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - zypre_BoxLoopSetK(4, i4); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - i4 += hypre__i0inc4; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - i4 += 
hypre__ikinc4[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ - stride1, i1, \ - stride2, i2) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BasicBoxLoopInitK(1, stride1); \ - zypre_BasicBoxLoopInitK(2, stride2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - - -#define hypre_LoopBegin(size, idx) \ -{ \ - HYPRE_Int idx; \ - OMP0 \ - for (idx = 0; idx < size; idx ++) \ - { - -#define hypre_LoopEnd() \ - } \ +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ + } \ + }, hypre__tot, sum_buf); \ } - - - - - - - - - - - - /********************************************************************* * renamings *********************************************************************/ @@ -653,17 +415,17 @@ else \ /* Reduction */ /* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ - zypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) + hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) #define hypre_BoxLoop1ReductionEnd(i1, reducesum) \ - zypre_newBoxLoop1End(i1) + hypre_newBoxLoop1ReductionEnd(i1, reducesum) #define hypre_BoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, reducesum) \ - zypre_newBoxLoop2Begin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) + hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, reducesum) #define hypre_BoxLoop2ReductionEnd(i1, i2, reducesum) \ - 
zypre_newBoxLoop2End(i1, i2) + hypre_newBoxLoop2ReductionEnd(i1, i2, reducesum) #endif diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index 497cd4280a..cfef661cb0 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -62,7 +62,7 @@ hypre_StructInnerProd( hypre_StructVector *x, hypre_BoxGetSize(box, loop_size); -#if defined(HYPRE_USING_KOKKOS) +#if defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_SYCL) HYPRE_Real box_sum = 0.0; #elif defined(HYPRE_USING_RAJA) ReduceSum box_sum(0.0); @@ -89,7 +89,11 @@ hypre_StructInnerProd( hypre_StructVector *x, box_sum) { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); +#if defined(HYPRE_USING_SYCL) + sum += tmp; +#else box_sum += tmp; +#endif } hypre_BoxLoop2ReductionEnd(xi, yi, box_sum); diff --git a/src/test/simple.c b/src/test/simple.c index e385aefe69..ff5e40b103 100644 --- a/src/test/simple.c +++ b/src/test/simple.c @@ -141,47 +141,135 @@ hypre_int main( hypre_int argc, char *argv[] ) { + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + + + /* return 0; */ + +/******************************************************************************/ +/******************************************************************************/ + + /* Get device */ + /* sycl::device syclDev = sycl::device(sycl::default_selector{}); */ + + /* /1* Get asynchandler *1/ */ + /* auto sycl_asynchandler = [] (sycl::exception_list exceptions) */ + /* { */ + /* for (std::exception_ptr const& e : exceptions) */ + /* { */ + /* try */ + /* { */ + /* std::rethrow_exception(e); */ + /* } */ + /* catch (sycl::exception const& ex) */ + /* { */ + /* std::cout << "Caught asynchronous SYCL exception:" << std::endl */ + /* << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; */ + /* } */ + /* } */ + /* }; */ + + /* /1* Setup sycl context *1/ */ + /* sycl::context syclctxt = sycl::context(syclDev, 
sycl_asynchandler); */ + + /* /1* Setup queue *1/ */ + /* sycl::queue *my_queue = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ + + /* /1* Show device associated with queue *1/ */ + /* ShowDevice(*my_queue); */ + + /* return 0; */ + + + +/******************************************************************************/ +/******************************************************************************/ + + int length = 1024; + + sycl::default_selector selector; + sycl::queue myq(selector); + std::cout<<"Running on: "<()<<"\n"; + + auto A = sycl::malloc_shared(length, myq); + + auto gr = sycl::range<1>(length); + auto lr = sycl::range<1>(32); //change me, too small? + + + for(int i=0;i(i+1); //initialize + + //MAKE SURE I"M HOST & DEVICE ACCESSIBLE! + auto fsum = sycl::malloc_shared(1, myq); + + { + myq.submit( [&](auto &h) { + /* auto properties = sycl::property::reduction::initialize_to_identity{}; */ + h.parallel_for(sycl::nd_range<1>(gr,lr), + sycl::ONEAPI::reduction(fsum, std::plus<>()), + [=](sycl::nd_item<1> it, auto &sum){ + int i = it.get_global_id(0); + sum += A[i]; + }); + }).wait_and_throw(); + } + + printf("sum: %f\n",fsum[0]); + return 0; + + +/******************************************************************************/ +/******************************************************************************/ + + + /* initialize */ - hypre_MPI_Init(&argc, &argv); - HYPRE_Init(); - /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ + /* hypre_MPI_Init(&argc, &argv); */ + /* HYPRE_Init(); */ + /* /1* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); *1/ */ - HYPRE_Int length = 1000; - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); - HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); - HYPRE_Real sum_var = 0; - sycl::buffer sum_buf(&sum_var, 1); + /* HYPRE_Int length = 
1000; */ + /* const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); */ + /* const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); */ + /* HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); */ + /* HYPRE_Real sum_var = 0; */ + /* /1* sycl::buffer sum_buf(&sum_var, 1); *1/ */ + /* sycl::buffer sum_buf{&sum_var, 1}; */ - /* Reduction parallel_for with accessor */ - std::cout << "Launching parallel_for reduction with accessor" << std::endl; - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); + /* /1* Reduction parallel_for with accessor *1/ */ + /* std::cout << "Launching parallel_for reduction with accessor" << std::endl; */ + /* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ + /* { */ + /* sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); */ + /* /1* auto sumReduction = sycl::reduction(sum_buf, cgh, sycl::plus<>()); *1/ */ - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), - [=] (sycl::nd_item<1> item, auto &sum) - { - /* trivial kernel */ - }); - }).wait_and_throw(); + /* /1* WM: NOTE - on JLSE, ONEAPI is marked as deprecated to be replaced by ext::oneapi *1/ */ + /* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), */ + /* /1* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sumReduction, *1/ */ + /* [=] (sycl::nd_item<1> item, auto &sum) */ + /* { */ + /* /1* trivial kernel *1/ */ + /* }); */ + /* }).wait_and_throw(); */ - HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); +/* HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); */ - /* Reduction parallel_for with unified memory pointer */ - std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; - 
hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), - [=] (sycl::nd_item<1> item, auto &sum) - { - /* trivial kernel */ - }); - }).wait_and_throw(); +/* /1* Reduction parallel_for with unified memory pointer *1/ */ +/* std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; */ +/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ +/* { */ +/* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), */ +/* [=] (sycl::nd_item<1> item, auto &sum) */ +/* { */ +/* /1* trivial kernel *1/ */ +/* }); */ +/* }).wait_and_throw(); */ @@ -196,9 +284,11 @@ main( hypre_int argc, /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - hypre_printf("DONE\n"); - exit(0); + /* hypre_printf("DONE\n"); */ + /* exit(0); */ +/******************************************************************************/ +/******************************************************************************/ /* variables */ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 61e8ae0998..6fe8451f0f 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -4,6 +4,11 @@ #ifndef hypre_UTILITIES_HPP #define hypre_UTILITIES_HPP +/* WM: todo - I have a problem where I need to include this outside the extern "C++" {} block, so I'm doing this manually here for now */ +#if defined(HYPRE_USING_SYCL) +#include +#endif + #ifdef __cplusplus extern "C++" { #endif @@ -105,7 +110,9 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) -#include +/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... 
totally strange, but true */ +/* #include */ +/* WM: todo - include below as necessary */ /* #include */ /* #include */ /* #include */ @@ -278,6 +285,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -398,6 +406,8 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) +/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? */ +/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) /* return the number of work-items in current work-group */ template @@ -558,6 +568,8 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#elif defined(HYPRE_USING_SYCL) +/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 5b028a0ca1..0437d65175 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,7 +9,6 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -#include // WM: TODO: verify sycl::range<1> hypre_GetDefaultCUDABlockDimension() { @@ -975,10 +974,10 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) }; /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ - /* sycl::device syclDev = data->device; */ - /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ - /* stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ - stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); + sycl::device syclDev = data->device; + sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + /* stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); */ data->streams[i] = stream; } #endif @@ -1235,8 +1234,8 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: commenting out for now since I'm having trouble finding the device on frank */ - /* hypre_DeviceDataDevice(data) = sycl::device(sycl::gpu_selector{}); */ + /* WM: does the default selector get a GPU if available? 
*/ + hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 9d95075c3e..a442f2229f 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,7 +53,8 @@ #elif defined(HYPRE_USING_SYCL) -#include +/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* #include */ /* WM: todo - include below as necessary */ /* #include */ /* #include */ diff --git a/src/utilities/headers b/src/utilities/headers index 6d54d6d434..d3a0e28dba 100755 --- a/src/utilities/headers +++ b/src/utilities/headers @@ -79,6 +79,11 @@ cat > $INTERNAL_HEADER <<@ #ifndef hypre_UTILITIES_HPP #define hypre_UTILITIES_HPP +/* WM: todo - I have a problem where I need to include this outside the extern "C++" {} block, so I'm doing this manually here for now */ +#if defined(HYPRE_USING_SYCL) +#include +#endif + #ifdef __cplusplus extern "C++" { #endif From df301df9bcc5c197f221e334089125a803d410cc Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 17:00:13 -0700 Subject: [PATCH 15/44] Cleanup --- src/configure | 1 - src/struct_mv/_hypre_struct_mv.hpp | 317 ++------------ src/struct_mv/boxloop_sycl.h | 2 - src/test/Makefile | 5 - src/test/TEST_ij/solvers.jobs | 145 ++++--- src/test/simple.c | 642 ----------------------------- src/utilities/device_utils.c | 7 +- src/utilities/general.c | 40 +- src/utilities/memory.c | 7 - 9 files changed, 122 insertions(+), 1044 deletions(-) delete mode 100644 src/test/simple.c diff --git a/src/configure b/src/configure index 7b8443595b..bb48dbdf9b 100755 --- a/src/configure +++ b/src/configure @@ -9081,7 +9081,6 @@ fi if test x"$hypre_using_sycl" == x"yes"; then : -# WM: not setting this with sycl for now since it is giving me problems $as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h $as_echo "#define 
HYPRE_USING_SYCL 1" >>confdefs.h diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 86aded1b93..e4824ec744 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1258,24 +1258,11 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, } } - - - - - - - - - - - - - - #ifdef __cplusplus } #endif + /********************************************************************* * Init/Declare/IncK etc. *********************************************************************/ @@ -1360,10 +1347,9 @@ else \ } /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ - HYPRE_Int idx_local = idx; \ + HYPRE_Int idx_local = idx; \ hypre_IndexD(local_idx, 0) = idx_local % box.lsize0; \ idx_local = idx_local / box.lsize0; \ hypre_IndexD(local_idx, 1) = idx_local % box.lsize1; \ @@ -1388,7 +1374,6 @@ else \ index[2] = hypre_IndexD(local_idx, 2); - /********************************************************************* * Boxloops *********************************************************************/ @@ -1400,7 +1385,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1420,7 +1405,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1443,7 +1428,7 @@ else \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ 
BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1469,7 +1454,7 @@ else \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1492,7 +1477,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1506,7 +1491,7 @@ else \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ BoxLoopforall( [=] (sycl::nd_item<1> item) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ @@ -1517,294 +1502,49 @@ else \ /* Reduction BoxLoop1 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ -{ \ +#define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ +{ \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ hypre_newBoxLoopDeclare(databox1); \ hypre_BoxLoopIncK(1, databox1, i1); -#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ +#define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, sum_buf); \ } /* Reduction BoxLoop2 */ /* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ /* Right now, it is hardcoded as a HYPRE_Real */ -#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, sum_var) \ -{ \ - hypre_newBoxLoopInit(ndim, loop_size); \ - hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ +#define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ + dbox2, start2, stride2, i2, sum_var) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ + hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ + sycl::buffer sum_buf(&sum_var, 1); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ - HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ { \ - hypre_newBoxLoopDeclare(databox1); \ - hypre_BoxLoopIncK(1, databox1, i1); \ + hypre_newBoxLoopDeclare(databox1); \ + hypre_BoxLoopIncK(1, databox1, i1); \ hypre_BoxLoopIncK(2, databox2, i2); -#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ +#define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ -} - - - - - - - -/********************************************************************* - * HOST IMPLEMENTATION - *********************************************************************/ - -#ifdef HYPRE_USING_OPENMP -#define HYPRE_BOX_REDUCTION -#if defined(WIN32) && defined(_MSC_VER) -#define Pragma(x) __pragma(HYPRE_XSTR(x)) -#else -#define Pragma(x) _Pragma(HYPRE_XSTR(x)) -#endif -#define OMP0 Pragma(omp parallel for HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#define OMP1 Pragma(omp parallel for private(HYPRE_BOX_PRIVATE) HYPRE_BOX_REDUCTION HYPRE_SMP_SCHEDULE) -#else /* #ifdef 
HYPRE_USING_OPENMP */ -#define OMP0 -#define OMP1 -#endif /* #ifdef HYPRE_USING_OPENMP */ - -#define zypre_newBoxLoop0Begin(ndim, loop_size) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopInit(ndim, loop_size); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - zypre_BoxLoopSet(); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop0End() \ - } \ - zypre_BoxLoopInc1(); \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop1Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1) \ -{ \ - HYPRE_Int i1; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop1End(i1) \ - i1 += hypre__i0inc1; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - - -#define zypre_newBoxLoop2Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2) \ -{ \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define 
zypre_newBoxLoop2End(i1, i2) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ + }, hypre__tot, sum_buf); \ } -#define zypre_newBoxLoop3Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3) \ -{ \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop3End(i1, i2, i3) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBoxLoop4Begin(ndim, loop_size, \ - dbox1, start1, stride1, i1, \ - dbox2, start2, stride2, i2, \ - dbox3, start3, stride3, i3, \ - dbox4, start4, stride4, i4) \ -{ \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopDeclareK(3); \ - zypre_BoxLoopDeclareK(4); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BoxLoopInitK(1, dbox1, start1, stride1, i1); \ - zypre_BoxLoopInitK(2, dbox2, start2, stride2, i2); \ - zypre_BoxLoopInitK(3, dbox3, start3, stride3, i3); \ - zypre_BoxLoopInitK(4, 
dbox4, start4, stride4, i4); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2, i3, i4; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - zypre_BoxLoopSetK(3, i3); \ - zypre_BoxLoopSetK(4, i4); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - -#define zypre_newBoxLoop4End(i1, i2, i3, i4) \ - i1 += hypre__i0inc1; \ - i2 += hypre__i0inc2; \ - i3 += hypre__i0inc3; \ - i4 += hypre__i0inc4; \ - } \ - zypre_BoxLoopInc1(); \ - i1 += hypre__ikinc1[hypre__d]; \ - i2 += hypre__ikinc2[hypre__d]; \ - i3 += hypre__ikinc3[hypre__d]; \ - i4 += hypre__ikinc4[hypre__d]; \ - zypre_BoxLoopInc2(); \ - } \ - } \ -} - -#define zypre_newBasicBoxLoop2Begin(ndim, loop_size, \ - stride1, i1, \ - stride2, i2) \ -{ \ - zypre_BoxLoopDeclare(); \ - zypre_BoxLoopDeclareK(1); \ - zypre_BoxLoopDeclareK(2); \ - zypre_BoxLoopInit(ndim, loop_size); \ - zypre_BasicBoxLoopInitK(1, stride1); \ - zypre_BasicBoxLoopInitK(2, stride2); \ - OMP1 \ - for (hypre__block = 0; hypre__block < hypre__num_blocks; hypre__block++) \ - { \ - HYPRE_Int i1, i2; \ - zypre_BoxLoopSet(); \ - zypre_BoxLoopSetK(1, i1); \ - zypre_BoxLoopSetK(2, i2); \ - for (hypre__J = 0; hypre__J < hypre__JN; hypre__J++) \ - { \ - for (hypre__I = 0; hypre__I < hypre__IN; hypre__I++) \ - { - - -#define hypre_LoopBegin(size, idx) \ -{ \ - HYPRE_Int idx; \ - OMP0 \ - for (idx = 0; idx < size; idx ++) \ - { - -#define hypre_LoopEnd() \ - } \ -} - - - - - - - - - - - - - - /********************************************************************* * renamings *********************************************************************/ @@ -1824,7 +1564,6 @@ else \ #define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ 
hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 1c44ee3e08..311c235567 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -195,7 +195,6 @@ else \ } /* Given input 1-D 'idx' in box, get 3-D 'local_idx' in loop_size */ -/* WM: todo - double check that item.get_local_id(0) is actually what you want below */ #define hypre_newBoxLoopDeclare(box) \ hypre_Index local_idx; \ HYPRE_Int idx_local = idx; \ @@ -413,7 +412,6 @@ else \ #define hypre_BasicBoxLoop2Begin hypre_newBasicBoxLoop2Begin /* Reduction */ -/* WM: todo - using CPU version for now */ #define hypre_BoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) \ hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, reducesum) diff --git a/src/test/Makefile b/src/test/Makefile index b5910211c6..975e702290 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -139,11 +139,6 @@ ij: ij.o @echo "Building" $@ "... " ${LINK_CC} -o $@ $< ${LFLAGS} -# WM: TODO: remove -simple: simple.obj - @echo "Building" $@ "... " - ${LINK_CC} -o $@ $< ${LFLAGS} - ij_assembly: ij_assembly.o @echo "Building" $@ "... 
" ${LINK_CC} -o $@ $< ${LFLAGS} diff --git a/src/test/TEST_ij/solvers.jobs b/src/test/TEST_ij/solvers.jobs index d2f69b045e..f1c37d82ca 100755 --- a/src/test/TEST_ij/solvers.jobs +++ b/src/test/TEST_ij/solvers.jobs @@ -29,46 +29,45 @@ # 60: DS_FlexGMRES # #============================================================================= -# WM: TODO remove -exec_host -mpirun -np 2 ./ij -exec_host -solver 1 -rhsrand > solvers.out.0 -mpirun -np 2 ./ij -exec_host -solver 2 -rhsrand > solvers.out.1 -mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand > solvers.out.2 -mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand > solvers.out.3 -mpirun -np 2 ./ij -exec_host -solver 5 -rhsrand -w 0.67 -ns 2 > solvers.out.4 -mpirun -np 2 ./ij -exec_host -solver 6 -rhsrand > solvers.out.5 -#mpirun -np 2 ./ij -exec_host -solver 7 -rhsrand > solvers.out.6 -#mpirun -np 2 ./ij -exec_host -solver 8 -rhsrand > solvers.out.7 -mpirun -np 2 ./ij -exec_host -solver 20 -rhsrand > solvers.out.8 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand > solvers.out.9 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 -mpirun -np 2 ./ij -exec_host -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 -mpirun -np 2 ./ij -exec_host -solver 16 -rhsrand > solvers.out.12 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand > solvers.out.13 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 > solvers.out.14 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 -mpirun -np 2 ./ij -exec_host -solver 17 -rhsrand -unroll 4 > solvers.out.16 -mpirun -np 2 ./ij -exec_host -solver 3 -rhsrand -check_residual > solvers.out.17 -mpirun -np 2 ./ij -exec_host -solver 4 -rhsrand -check_residual > solvers.out.18 +mpirun -np 2 ./ij -solver 1 -rhsrand > solvers.out.0 +mpirun -np 2 ./ij -solver 2 -rhsrand > solvers.out.1 +mpirun -np 2 ./ij -solver 3 -rhsrand > solvers.out.2 +mpirun -np 2 ./ij -solver 4 -rhsrand > solvers.out.3 +mpirun -np 2 ./ij -solver 5 
-rhsrand -w 0.67 -ns 2 > solvers.out.4 +mpirun -np 2 ./ij -solver 6 -rhsrand > solvers.out.5 +#mpirun -np 2 ./ij -solver 7 -rhsrand > solvers.out.6 +#mpirun -np 2 ./ij -solver 8 -rhsrand > solvers.out.7 +mpirun -np 2 ./ij -solver 20 -rhsrand > solvers.out.8 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand > solvers.out.9 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 2 > solvers.out.10 +mpirun -np 2 ./ij -solver 20 -cf 0.5 -rhsrand -solver_type 3 > solvers.out.11 +mpirun -np 2 ./ij -solver 16 -rhsrand > solvers.out.12 +mpirun -np 2 ./ij -solver 17 -rhsrand > solvers.out.13 +mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 > solvers.out.14 +mpirun -np 2 ./ij -solver 17 -rhsrand -cgs 2 -unroll 8 > solvers.out.15 +mpirun -np 2 ./ij -solver 17 -rhsrand -unroll 4 > solvers.out.16 +mpirun -np 2 ./ij -solver 3 -rhsrand -check_residual > solvers.out.17 +mpirun -np 2 ./ij -solver 4 -rhsrand -check_residual > solvers.out.18 #systems AMG run ...unknown approach, hybrid approach, nodal approach -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh -mpirun -np 2 ./ij -exec_host -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 > solvers.out.sysu +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -nodal 1 -smtype 6 -smlv 10 -dom 1 -ov 0 > solvers.out.sysh +mpirun -np 2 ./ij -n 20 20 20 -sysL 2 -nf 2 -interptype 10 -Pmx 6 > solvers.out.sysn #LGMRS and FlexGMRES -mpirun -np 2 ./ij -exec_host -solver 50 -rhsrand > solvers.out.101 -mpirun -np 2 ./ij -exec_host -solver 51 -rhsrand > solvers.out.102 -mpirun -np 2 ./ij -exec_host -solver 60 -rhsrand > solvers.out.103 -mpirun -np 2 ./ij -exec_host -solver 61 -rhsrand > solvers.out.104 +mpirun -np 2 ./ij -solver 50 -rhsrand > solvers.out.101 +mpirun -np 2 ./ij -solver 51 -rhsrand > solvers.out.102 +mpirun -np 2 ./ij -solver 60 -rhsrand > 
solvers.out.103 +mpirun -np 2 ./ij -solver 61 -rhsrand > solvers.out.104 #agglomerated coarse grid solve -mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 > solvers.out.105 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 > solvers.out.107 #redundant coarse grid solve -mpirun -np 8 ./ij -exec_host -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 +mpirun -np 8 ./ij -n 80 80 80 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -red 1 > solvers.out.106 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -agg_nl 1 -seq_th 100 -solver 1 -rlx 6 -sysL 3 -nf 3 -red 1 > solvers.out.108 #additive cycles mpirun -np 2 ./ij -n 20 20 20 -P 2 1 1 -agg_nl 1 -solver 1 -CF 0 -rlx 0 -w 0.7 -rlx_coarse 0 -ns_coarse 2 > solvers.out.109 @@ -83,12 +82,12 @@ mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -mult_add 0 -ns 2 -ra mpirun -np 8 ./ij -n 20 20 20 -P 2 2 2 -agg_nl 1 -solver 3 -rlx 18 -ns 2 -rlx_coarse 18 -ns_coarse 2 > solvers.out.120 #nonGalerkin version -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 1 0.03 > solvers.out.114 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -nongalerk_tol 3 0.0 0.01 0.05 > solvers.out.115 #RAP options -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 0 > solvers.out.116 -mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 +mpirun -np 8 ./ij -n 40 40 40 -P 
2 2 2 -solver 3 -rap 0 > solvers.out.116 +mpirun -np 8 ./ij -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out.117 # # MGR and MGR-PCG @@ -96,26 +95,26 @@ mpirun -np 8 ./ij -exec_host -n 40 40 40 -P 2 2 2 -solver 3 -rap 1 > solvers.out # coarse grid solver checks (1-level MGR == AMG (or coarse grid solver)) # Also checks for keeping coarse nodes to coarsest level # coarse grid size in output should be ~ mgr_num_reserved_nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 0 > solvers.out.200 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 0 -mgr_bsize 2 -mgr_num_reserved_nodes 100 > solvers.out.201 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 0 > solvers.out.202 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 1 -mgr_num_reserved_nodes 100 > solvers.out.203 # multi level MGR tests with different coarse grid type strategies # Fix non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 -mpirun -np 2 ./ij -exec_host -solver 
70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.204 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.205 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.206 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 1 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.207 # Not fixed non C points to F points with different F-relaxation methods (single/multilevel F-relaxation) # with/ without reserved coarse nodes -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 -mpirun -np 2 ./ij -exec_host -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > solvers.out.211 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.208 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 100 > solvers.out.209 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 0 > solvers.out.210 +mpirun -np 2 ./ij -solver 70 -mgr_nlevels 5 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 1 -mgr_num_reserved_nodes 100 > 
solvers.out.211 # MGR-PCG tests -mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 -mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 0 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.212 +mpirun -np 2 ./ij -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_to_f 0 -mgr_frelax_method 0 -mgr_num_reserved_nodes 0 > solvers.out.213 # # hypre_ILU tests @@ -124,39 +123,39 @@ mpirun -np 2 ./ij -exec_host -solver 71 -mgr_nlevels 1 -mgr_bsize 2 -mgr_non_c_t # Tests ILU-(Flex)GMRES # Test AMG with ILU as a complex smoother # -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 -mpirun -np 1 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 +mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 0 > solvers.out.300 +mpirun -np 1 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.301 +mpirun -np 1 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.302 # parallel ILU # BJ -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 +mpirun -np 2 ./ij -solver 80 -ilu_type 0 -ilu_lfil 1 > solvers.out.303 +mpirun -np 2 ./ij -solver 80 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.304 # GMRES+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 +mpirun -np 2 ./ij 
-solver 80 -ilu_type 10 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.305 +mpirun -np 2 ./ij -solver 80 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.306 # NSH+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 +mpirun -np 2 ./ij -solver 80 -ilu_type 20 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.307 +mpirun -np 2 ./ij -solver 80 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.308 # RAS+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 +mpirun -np 2 ./ij -solver 80 -ilu_type 30 -ilu_lfil 1 > solvers.out.309 +mpirun -np 2 ./ij -solver 80 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.310 # ddPQ-GMRES+ILU -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 -mpirun -np 2 ./ij -exec_host -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 +mpirun -np 2 ./ij -solver 80 -ilu_type 40 -ilu_lfil 1 -ilu_schur_max_iter 5 > solvers.out.311 +mpirun -np 2 ./ij -solver 80 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.312 ## ILU-GMRES -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 0 -ilu_lfil 0 > solvers.out.313 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 -mpirun -np 2 ./ij -exec_host -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 +mpirun -np 2 ./ij -solver 81 -ilu_type 0 -ilu_lfil 0 > 
solvers.out.313 +mpirun -np 2 ./ij -solver 81 -ilu_type 1 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.314 +mpirun -np 2 ./ij -solver 81 -ilu_type 30 -ilu_lfil 0 > solvers.out.315 +mpirun -np 2 ./ij -solver 81 -ilu_type 31 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 > solvers.out.316 ## ILU-FlexGMRES -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 +mpirun -np 2 ./ij -solver 82 -ilu_type 10 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.317 +mpirun -np 2 ./ij -solver 82 -ilu_type 11 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.318 +mpirun -np 2 ./ij -solver 82 -ilu_type 20 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.319 +mpirun -np 2 ./ij -solver 82 -ilu_type 21 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.320 +mpirun -np 2 ./ij -solver 82 -ilu_type 40 -ilu_lfil 0 -ilu_schur_max_iter 5 > solvers.out.321 +mpirun -np 2 ./ij -solver 82 -ilu_type 41 -ilu_droptol 1.0e-2 -ilu_max_row_nnz 1000 -ilu_schur_max_iter 5 > solvers.out.322 ## RAP-ILU -mpirun -np 2 ./ij -exec_host -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 +mpirun -np 2 ./ij -solver 82 -ilu_type 50 -ilu_lfil 0 > solvers.out.323 ## ILU smoother for AMG mpirun -np 2 ./ij -solver 0 -smtype 5 -smlv 1 -ilu_type 30 > solvers.out.324 mpirun -np 2 ./ij -solver 0 
-smtype 15 -smlv 1 -ilu_type 30 > solvers.out.325 diff --git a/src/test/simple.c b/src/test/simple.c deleted file mode 100644 index ff5e40b103..0000000000 --- a/src/test/simple.c +++ /dev/null @@ -1,642 +0,0 @@ -/* WM: todo - remove this file from git */ - -#include "HYPRE.h" -#include "_hypre_struct_mv.h" -#include "_hypre_struct_mv.hpp" - -HYPRE_Int AddValuesVector( hypre_StructGrid *gridvector, - hypre_StructVector *zvector, - HYPRE_Int *period, - HYPRE_Real value ) ; - - - - -HYPRE_Int -cpu_hypre_StructVectorSetConstantValues( hypre_StructVector *vector, - HYPRE_Complex values ) -{ - hypre_Box *v_data_box; - - HYPRE_Complex *vp; - - hypre_BoxArray *boxes; - hypre_Box *box; - hypre_Index loop_size; - hypre_IndexRef start; - hypre_Index unit_stride; - - HYPRE_Int i; - - /*----------------------------------------------------------------------- - * Set the vector coefficients - *-----------------------------------------------------------------------*/ - - hypre_SetIndex(unit_stride, 1); - - boxes = hypre_StructGridBoxes(hypre_StructVectorGrid(vector)); - hypre_ForBoxI(i, boxes) - { - box = hypre_BoxArrayBox(boxes, i); - start = hypre_BoxIMin(box); - - v_data_box = - hypre_BoxArrayBox(hypre_StructVectorDataSpace(vector), i); - vp = hypre_StructVectorBoxData(vector, i); - - hypre_BoxGetSize(box, loop_size); - -#define DEVICE_VAR is_device_ptr(vp) - zypre_newBoxLoop1Begin(hypre_StructVectorNDim(vector), loop_size, - v_data_box, start, unit_stride, vi); - { - vp[vi] = values; - } - zypre_newBoxLoop1End(vi); -#undef DEVICE_VAR - } - - return hypre_error_flag; -} - -HYPRE_Int -my_hypre_StructAxpy( HYPRE_Complex alpha, - hypre_StructVector *x, - hypre_StructVector *y ) -{ - hypre_Box *x_data_box; - hypre_Box *y_data_box; - - HYPRE_Complex *xp; - HYPRE_Complex *yp; - - hypre_BoxArray *boxes; - hypre_Box *box; - hypre_Index loop_size; - hypre_IndexRef start; - hypre_Index unit_stride; - - HYPRE_Int i; - - hypre_SetIndex(unit_stride, 1); - - boxes = 
hypre_StructGridBoxes(hypre_StructVectorGrid(y)); - hypre_ForBoxI(i, boxes) - { - box = hypre_BoxArrayBox(boxes, i); - start = hypre_BoxIMin(box); - - x_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(x), i); - y_data_box = hypre_BoxArrayBox(hypre_StructVectorDataSpace(y), i); - - xp = hypre_StructVectorBoxData(x, i); - yp = hypre_StructVectorBoxData(y, i); - - hypre_BoxGetSize(box, loop_size); - -#define DEVICE_VAR is_device_ptr(yp,xp) - /* WM: todo */ - /* my_hypre_BoxLoop2Begin(hypre_StructVectorNDim(x), loop_size, */ - /* x_data_box, start, unit_stride, xi, */ - /* y_data_box, start, unit_stride, yi); */ - /* { */ - /* yp[yi] += alpha * xp[xi]; */ - /* } */ - /* my_hypre_BoxLoop2End(xi, yi); */ -#undef DEVICE_VAR - } - - return hypre_error_flag; -} - - -/**************************** - * show device function copied from oneAPI examples - ****************************/ -#include -#include "dpc_common.hpp" - -void ShowDevice(sycl::queue &q) { - using namespace std; - using namespace sycl; - // Output platform and device information. 
- auto device = q.get_device(); - auto p_name = device.get_platform().get_info(); - cout << std::setw(20) << "Platform Name: " << p_name << "\n"; - auto p_version = device.get_platform().get_info(); - cout << std::setw(20) << "Platform Version: " << p_version << "\n"; - auto d_name = device.get_info(); - cout << std::setw(20) << "Device Name: " << d_name << "\n"; - auto max_work_group = device.get_info(); - cout << std::setw(20) << "Max Work Group: " << max_work_group << "\n"; - auto max_compute_units = device.get_info(); - cout << std::setw(20) << "Max Compute Units: " << max_compute_units << "\n\n"; -} - -/**************************** - * main - ****************************/ - -hypre_int -main( hypre_int argc, - char *argv[] ) -{ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ - /* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); */ - - - /* return 0; */ - -/******************************************************************************/ -/******************************************************************************/ - - /* Get device */ - /* sycl::device syclDev = sycl::device(sycl::default_selector{}); */ - - /* /1* Get asynchandler *1/ */ - /* auto sycl_asynchandler = [] (sycl::exception_list exceptions) */ - /* { */ - /* for (std::exception_ptr const& e : exceptions) */ - /* { */ - /* try */ - /* { */ - /* std::rethrow_exception(e); */ - /* } */ - /* catch (sycl::exception const& ex) */ - /* { */ - /* std::cout << "Caught asynchronous SYCL exception:" << std::endl */ - /* << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; */ - /* } */ - /* } */ - /* }; */ - - /* /1* Setup sycl context *1/ */ - /* sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); */ - - /* /1* Setup queue *1/ */ - /* sycl::queue *my_queue = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); */ - - /* /1* Show device associated with queue *1/ */ - /* ShowDevice(*my_queue); */ - - /* return 0; */ - - - 
-/******************************************************************************/ -/******************************************************************************/ - - int length = 1024; - - sycl::default_selector selector; - sycl::queue myq(selector); - std::cout<<"Running on: "<()<<"\n"; - - auto A = sycl::malloc_shared(length, myq); - - auto gr = sycl::range<1>(length); - auto lr = sycl::range<1>(32); //change me, too small? - - - for(int i=0;i(i+1); //initialize - - //MAKE SURE I"M HOST & DEVICE ACCESSIBLE! - auto fsum = sycl::malloc_shared(1, myq); - - { - myq.submit( [&](auto &h) { - /* auto properties = sycl::property::reduction::initialize_to_identity{}; */ - h.parallel_for(sycl::nd_range<1>(gr,lr), - sycl::ONEAPI::reduction(fsum, std::plus<>()), - [=](sycl::nd_item<1> it, auto &sum){ - int i = it.get_global_id(0); - sum += A[i]; - }); - }).wait_and_throw(); - } - - printf("sum: %f\n",fsum[0]); - return 0; - - -/******************************************************************************/ -/******************************************************************************/ - - - - - /* initialize */ - /* hypre_MPI_Init(&argc, &argv); */ - /* HYPRE_Init(); */ - /* /1* ShowDevice(*hypre_HandleComputeStream(hypre_handle())); *1/ */ - - /* HYPRE_Int length = 1000; */ - /* const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); */ - /* const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); */ - /* HYPRE_Real *arr = hypre_CTAlloc(HYPRE_Real, length, HYPRE_MEMORY_DEVICE); */ - /* HYPRE_Real sum_var = 0; */ - /* /1* sycl::buffer sum_buf(&sum_var, 1); *1/ */ - /* sycl::buffer sum_buf{&sum_var, 1}; */ - - /* /1* Reduction parallel_for with accessor *1/ */ - /* std::cout << "Launching parallel_for reduction with accessor" << std::endl; */ - /* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ - /* { */ - /* sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); */ - /* /1* auto sumReduction = 
sycl::reduction(sum_buf, cgh, sycl::plus<>()); *1/ */ - - /* /1* WM: NOTE - on JLSE, ONEAPI is marked as deprecated to be replaced by ext::oneapi *1/ */ - /* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), */ - /* /1* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sumReduction, *1/ */ - /* [=] (sycl::nd_item<1> item, auto &sum) */ - /* { */ - /* /1* trivial kernel *1/ */ - /* }); */ - /* }).wait_and_throw(); */ - - - - -/* HYPRE_Real *sum_var_usm = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); */ - -/* /1* Reduction parallel_for with unified memory pointer *1/ */ -/* std::cout << "Launching parallel_for reduction with unified memory pointer" << std::endl; */ -/* hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) */ -/* { */ -/* cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_var_usm, sycl::ONEAPI::plus<>()), */ -/* [=] (sycl::nd_item<1> item, auto &sum) */ -/* { */ -/* /1* trivial kernel *1/ */ -/* }); */ -/* }).wait_and_throw(); */ - - - - - - /* sycl::queue my_queue(sycl::default_selector{}, dpc_common::exception_handler); */ - /* ShowDevice(my_queue); */ - - /* sycl::device gpu = sycl::device(sycl::cpu_selector{}); */ - /* sycl::device dev; */ - /* hypre_printf("is_host = %d\n", gpu.is_host()); */ - /* hypre_printf("is_cpu = %d\n", gpu.is_cpu()); */ - /* hypre_printf("is_cpu = %d\n", dev.is_cpu()); */ - /* hypre_printf("is_gpu = %d\n", gpu.is_gpu()); */ - /* hypre_printf("DONE\n"); */ - /* exit(0); */ - -/******************************************************************************/ -/******************************************************************************/ - - - /* variables */ - HYPRE_Int i, ix, iy, iz, ib; - HYPRE_Int p, q, r; - HYPRE_Int nx, ny, nz; - HYPRE_Int bx, by, bz; - HYPRE_Int nblocks; - HYPRE_Int dim; - HYPRE_Int sym; - HYPRE_Int **offsets; - HYPRE_Int **iupper; - HYPRE_Int **ilower; - HYPRE_Int periodic[3]; - 
HYPRE_Int istart[3]; - HYPRE_StructGrid grid; - HYPRE_StructVector b; - HYPRE_StructVector x; - HYPRE_Int num_ghost[6] = {0, 0, 0, 0, 0, 0}; - - dim = 3; - sym = 1; - nx = 10; - ny = 10; - nz = 10; - bx = 1; - by = 1; - bz = 1; - p = 1; - q = 1; - r = 1; - periodic[0] = 0; - periodic[1] = 0; - periodic[2] = 0; - istart[0] = -3; - istart[1] = -3; - istart[2] = -3; - - for (i = 0; i < 2*dim; i++) - { - num_ghost[i] = 1; - } - - switch (dim) - { - case 1: - nblocks = bx; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 2, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - offsets[2][0] = 1; - } - break; - - case 2: - nblocks = bx*by; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 3, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 5, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[3][0] = 1; - offsets[3][1] = 0; - offsets[4] = 
hypre_CTAlloc(HYPRE_Int, 2, HYPRE_MEMORY_HOST); - offsets[4][0] = 0; - offsets[4][1] = 1; - } - break; - - case 3: - nblocks = bx*by*bz; - if(sym) - { - offsets = hypre_CTAlloc(HYPRE_Int*, 4, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[0][2] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[1][2] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[2][2] = -1; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[3][0] = 0; - offsets[3][1] = 0; - offsets[3][2] = 0; - } - else - { - offsets = hypre_CTAlloc(HYPRE_Int*, 7, HYPRE_MEMORY_HOST); - offsets[0] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[0][0] = -1; - offsets[0][1] = 0; - offsets[0][2] = 0; - offsets[1] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[1][0] = 0; - offsets[1][1] = -1; - offsets[1][2] = 0; - offsets[2] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[2][0] = 0; - offsets[2][1] = 0; - offsets[2][2] = -1; - offsets[3] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[3][0] = 0; - offsets[3][1] = 0; - offsets[3][2] = 0; - offsets[4] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[4][0] = 1; - offsets[4][1] = 0; - offsets[4][2] = 0; - offsets[5] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[5][0] = 0; - offsets[5][1] = 1; - offsets[5][2] = 0; - offsets[6] = hypre_CTAlloc(HYPRE_Int, 3, HYPRE_MEMORY_HOST); - offsets[6][0] = 0; - offsets[6][1] = 0; - offsets[6][2] = 1; - } - break; - } - - - - /* initialize */ - hypre_MPI_Init(&argc, &argv); - HYPRE_Init(); - - /* prepare space for the extents */ - ilower = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); - iupper = hypre_CTAlloc(HYPRE_Int*, nblocks, HYPRE_MEMORY_HOST); - for (i = 0; i < nblocks; i++) - { - ilower[i] = 
hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); - iupper[i] = hypre_CTAlloc(HYPRE_Int, dim, HYPRE_MEMORY_HOST); - } - - /* compute ilower and iupper from (p,q,r), (bx,by,bz), and (nx,ny,nz) */ - ib = 0; - switch (dim) - { - case 1: - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ib++; - } - break; - case 2: - for (iy = 0; iy < by; iy++) - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ilower[ib][1] = istart[1]+ ny*(by*q+iy); - iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; - ib++; - } - break; - case 3: - for (iz = 0; iz < bz; iz++) - for (iy = 0; iy < by; iy++) - for (ix = 0; ix < bx; ix++) - { - ilower[ib][0] = istart[0]+ nx*(bx*p+ix); - iupper[ib][0] = istart[0]+ nx*(bx*p+ix+1) - 1; - ilower[ib][1] = istart[1]+ ny*(by*q+iy); - iupper[ib][1] = istart[1]+ ny*(by*q+iy+1) - 1; - ilower[ib][2] = istart[2]+ nz*(bz*r+iz); - iupper[ib][2] = istart[2]+ nz*(bz*r+iz+1) - 1; - ib++; - } - break; - } - /* create grid */ - HYPRE_StructGridCreate(hypre_MPI_COMM_WORLD, dim, &grid); - for (ib = 0; ib < nblocks; ib++) - { - /* Add to the grid a new box defined by ilower[ib], iupper[ib]...*/ - HYPRE_StructGridSetExtents(grid, ilower[ib], iupper[ib]); - } - HYPRE_StructGridSetPeriodic(grid, periodic); - HYPRE_StructGridSetNumGhost(grid, num_ghost); - HYPRE_StructGridAssemble(grid); - - /* create struct vectors */ - HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &b); - HYPRE_StructVectorInitialize(b); - AddValuesVector(grid,b,periodic,1.0); - HYPRE_StructVectorAssemble(b); - - HYPRE_StructVectorCreate(hypre_MPI_COMM_WORLD, grid, &x); - HYPRE_StructVectorInitialize(x); - AddValuesVector(grid,x,periodic,1.0); - HYPRE_StructVectorAssemble(x); - - hypre_StructVector *y = hypre_StructVectorClone(x); - hypre_StructVectorPrint("before", x, 1); - - /* call set const */ - cpu_hypre_StructVectorSetConstantValues(y, 5.0); - 
hypre_printf("my_hypre_StructVectorSetConstantValues() success!\n"); - - hypre_StructVectorPrint("after_cpu", y, 1); - - hypre_StructVectorSetConstantValues(x, 5.0); - hypre_printf("hypre_StructVectorSetConstantValues() success!\n"); - - hypre_StructVectorPrint("after_gpu", x, 1); - - /* call axpy */ - /* my_hypre_StructAxpy(1.0, x, b); */ - - - - - - - - hypre_printf("DONE\n"); - return 0; -} - -HYPRE_Int -AddValuesVector( hypre_StructGrid *gridvector, - hypre_StructVector *zvector, - HYPRE_Int *period, - HYPRE_Real value ) -{ -/* #include "_hypre_struct_mv.h" */ - HYPRE_Int ierr = 0; - hypre_BoxArray *gridboxes; - HYPRE_Int ib; - hypre_IndexRef ilower; - hypre_IndexRef iupper; - hypre_Box *box; - HYPRE_Real *values; - HYPRE_Int volume,dim; -#if 0 //defined(HYPRE_USING_CUDA) - HYPRE_Int data_location = hypre_StructGridDataLocation(hypre_StructVectorGrid(zvector)); -#endif - - gridboxes = hypre_StructGridBoxes(gridvector); - dim = hypre_StructGridNDim(gridvector); - - ib=0; - hypre_ForBoxI(ib, gridboxes) - { - box = hypre_BoxArrayBox(gridboxes, ib); - volume = hypre_BoxVolume(box); -#if 0 //defined(HYPRE_USING_CUDA) - if (data_location != HYPRE_MEMORY_HOST) - { - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); - } - else - { - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_HOST); - } -#else - values = hypre_CTAlloc(HYPRE_Real, volume,HYPRE_MEMORY_DEVICE); -#endif - /*----------------------------------------------------------- - * For periodic b.c. in all directions, need rhs to satisfy - * compatibility condition. Achieved by setting a source and - * sink of equal strength. All other problems have rhs = 1. 
- *-----------------------------------------------------------*/ - -#define DEVICE_VAR is_device_ptr(values) - if ((dim == 2 && period[0] != 0 && period[1] != 0) || - (dim == 3 && period[0] != 0 && period[1] != 0 && period[2] != 0)) - { - hypre_LoopBegin(volume,i) - { - values[i] = 0.0; - values[0] = value; - values[volume - 1] = -value; - - } - hypre_LoopEnd() - } - else - { - hypre_LoopBegin(volume,i) - { - values[i] = value; - } - hypre_LoopEnd() - } -#undef DEVICE_VAR - - ilower = hypre_BoxIMin(box); - iupper = hypre_BoxIMax(box); - - HYPRE_StructVectorSetBoxValues(zvector, ilower, iupper, values); - -#if 0 //defined(HYPRE_USING_CUDA) - if (data_location != HYPRE_MEMORY_HOST) - { - hypre_TFree(values,HYPRE_MEMORY_DEVICE); - } - else - { - hypre_TFree(values,HYPRE_MEMORY_HOST); - } -#else - hypre_TFree(values,HYPRE_MEMORY_DEVICE); -#endif - } - - return ierr; -} diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 0437d65175..3ff5aab39b 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -973,11 +973,9 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) } }; - /* WM: having trouble with getting the device on frank, so temporarily just passing the default selector */ sycl::device syclDev = data->device; sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); - /* stream = new sycl::queue(sycl::default_selector{}, sycl::property_list{sycl::property::queue::in_order{}}); */ data->streams[i] = stream; } #endif @@ -1234,7 +1232,7 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: does the default selector get a GPU if available? */ + /* WM: does the default selector get a GPU if available? 
Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); #else hypre_DeviceDataDevice(data) = 0; @@ -1491,8 +1489,7 @@ hypre_bind_device( HYPRE_Int myid, hypre_MPI_Comm_free(&node_comm); /* get number of devices on this node */ - /* WM: doesn't work on frank... commenting out */ - /* hypre_GetDeviceCount(&nDevices); */ + hypre_GetDeviceCount(&nDevices); nDevices = 1; /* set device */ diff --git a/src/utilities/general.c b/src/utilities/general.c index 2f332dcac8..8ec1e818e1 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -71,7 +71,7 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_) hypre_DeviceDataDestroy(hypre_HandleDeviceData(hypre_handle_)); #endif -// WM: in debug mode, hypre_TFree() checks the pointer location, which requires the +// In debug mode, hypre_TFree() checks the pointer location, which requires the // hypre_handle_'s compute queue if using sycl. But this was just destroyed above. #if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL) free(hypre_handle_); @@ -152,25 +152,25 @@ hypre_GetDeviceCount(hypre_int *device_count) #endif #if defined(HYPRE_USING_SYCL) - // WM: TODO - verify - sycl::platform platform(sycl::gpu_selector{}); - auto const& gpu_devices = platform.get_devices(); - for (int i = 0; i < gpu_devices.size(); i++) - { - if (gpu_devices[i].is_gpu()) - { - if(gpu_devices[i].get_info() > 0) - { - auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( - sycl::info::partition_affinity_domain::numa); - (*device_count) += subDevicesDomainNuma.size(); - } - else - { - (*device_count)++; - } - } - } + /* WM: todo - doesn't work on frank... 
commenting out */ + /* sycl::platform platform(sycl::gpu_selector{}); */ + /* auto const& gpu_devices = platform.get_devices(); */ + /* for (int i = 0; i < gpu_devices.size(); i++) */ + /* { */ + /* if (gpu_devices[i].is_gpu()) */ + /* { */ + /* if(gpu_devices[i].get_info() > 0) */ + /* { */ + /* auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( */ + /* sycl::info::partition_affinity_domain::numa); */ + /* (*device_count) += subDevicesDomainNuma.size(); */ + /* } */ + /* else */ + /* { */ + /* (*device_count)++; */ + /* } */ + /* } */ + /* } */ #endif return hypre_error_flag; diff --git a/src/utilities/memory.c b/src/utilities/memory.c index 37248e3033..a9941e1917 100644 --- a/src/utilities/memory.c +++ b/src/utilities/memory.c @@ -109,7 +109,6 @@ hypre_UnifiedMemset(void *ptr, HYPRE_Int value, size_t num) static inline void hypre_UnifiedMemPrefetch(void *ptr, size_t size, hypre_MemoryLocation location) { - /* hypre_printf("WM: debug - inside UnifiedMemPrefetch\n"); */ #if defined(HYPRE_USING_GPU) #ifdef HYPRE_DEBUG hypre_MemoryLocation tmp; @@ -252,7 +251,6 @@ hypre_DeviceMalloc(size_t size, HYPRE_Int zeroinit) static inline void * hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) { - /* hypre_printf("WM: debug - inside UnifiedMalloc\n"); */ void *ptr = NULL; #if defined(HYPRE_USING_UMPIRE_UM) @@ -277,7 +275,6 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) #if defined(HYPRE_USING_SYCL) HYPRE_SYCL_CALL( ptr = (void *)sycl::malloc_shared(size, *(hypre_HandleComputeStream(hypre_handle()))) ); - /* hypre_printf("WM: debug - did the sycl shared allocation\n"); */ #endif #endif /* #if defined(HYPRE_USING_UMPIRE_UM) */ @@ -285,7 +282,6 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit) /* prefecth to device */ if (ptr) { - /* hypre_printf("WM: debug - about to prefetch\n"); */ hypre_UnifiedMemPrefetch(ptr, size, hypre_MEMORY_DEVICE); } @@ -987,7 +983,6 @@ hypre_GetExecPolicy2(HYPRE_MemoryLocation location1, HYPRE_Int 
hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) { - /* hypre_printf("WM: debug - inside GetPointerLocation\n"); */ HYPRE_Int ierr = 0; #if defined(HYPRE_USING_GPU) @@ -1090,7 +1085,6 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) sycl::usm::alloc allocType; allocType = sycl::get_pointer_type(ptr, (hypre_HandleComputeStream(hypre_handle()))->get_context()); - /* hypre_printf("WM: debug - checking allocType\n"); */ if (allocType == sycl::usm::alloc::unknown) { *memory_location = hypre_MEMORY_HOST; @@ -1106,7 +1100,6 @@ hypre_GetPointerLocation(const void *ptr, hypre_MemoryLocation *memory_location) else if (allocType == sycl::usm::alloc::shared) { *memory_location = hypre_MEMORY_UNIFIED; - /* hypre_printf("WM: debug - IS UNIFIED MEMORY\n"); */ } #endif //HYPRE_USING_SYCL From 4e54d486037b0e5ec00e452d0722ada7dcd2c796 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 5 Oct 2021 17:29:04 -0700 Subject: [PATCH 16/44] Added hypreLoopBegin/End --- src/struct_mv/_hypre_struct_mv.hpp | 38 ++++++++++++++++++++---------- src/struct_mv/boxloop_sycl.h | 38 ++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 24 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index e4824ec744..8c111ed729 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1189,8 +1189,8 @@ extern "C++" { template void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) +BoxLoopforall( HYPRE_Int length, + LOOP_BODY loop_body) { /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ /* WM: TODO: uncomment above and remove below */ @@ -1383,7 +1383,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = 
(HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1393,7 +1393,7 @@ else \ #define hypre_newBoxLoop1End(i1) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 2 */ @@ -1403,7 +1403,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1414,7 +1414,7 @@ else \ #define hypre_newBoxLoop2End(i1, i2) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 3 */ @@ -1426,7 +1426,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1438,7 +1438,7 @@ else \ #define hypre_newBoxLoop3End(i1, i2, i3) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 4 */ @@ -1452,7 +1452,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1465,7 +1465,7 @@ else \ #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ } \ - }, hypre__tot); \ + }); \ } @@ -1475,7 +1475,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ 
+ BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1489,7 +1489,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ hypre_BasicBoxLoopDataDeclareK(2, ndim, loop_size, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1544,6 +1544,20 @@ else \ }, hypre__tot, sum_buf); \ } +/* Plain parallel_for loop */ +#define hypre_LoopBegin(size, idx) \ +{ \ + BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < size) \ + { \ + +#define hypre_LoopEnd() \ + } \ + }); \ +} + /********************************************************************* * renamings diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 311c235567..dd8f910562 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -37,8 +37,8 @@ extern "C++" { template void -BoxLoopforall( LOOP_BODY loop_body, - HYPRE_Int length ) +BoxLoopforall( HYPRE_Int length, + LOOP_BODY loop_body) { /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ /* WM: TODO: uncomment above and remove below */ @@ -231,7 +231,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -241,7 +241,7 @@ else \ #define hypre_newBoxLoop1End(i1) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 2 */ @@ -251,7 +251,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, 
stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -262,7 +262,7 @@ else \ #define hypre_newBoxLoop2End(i1, i2) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 3 */ @@ -274,7 +274,7 @@ else \ hypre_BoxLoopDataDeclareK(1, ndim,loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim,loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim,loop_size, dbox3, start3, stride3); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -286,7 +286,7 @@ else \ #define hypre_newBoxLoop3End(i1, i2, i3) \ } \ - }, hypre__tot); \ + }); \ } /* BoxLoop 4 */ @@ -300,7 +300,7 @@ else \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ hypre_BoxLoopDataDeclareK(3, ndim, loop_size, dbox3, start3, stride3); \ hypre_BoxLoopDataDeclareK(4, ndim, loop_size, dbox4, start4, stride4); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -313,7 +313,7 @@ else \ #define hypre_newBoxLoop4End(i1, i2, i3, i4) \ } \ - }, hypre__tot); \ + }); \ } @@ -323,7 +323,7 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -337,7 +337,7 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BasicBoxLoopDataDeclareK(1, ndim, loop_size, stride1); \ hypre_BasicBoxLoopDataDeclareK(2, ndim, 
loop_size, stride2); \ - BoxLoopforall( [=] (sycl::nd_item<1> item) \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -392,6 +392,20 @@ else \ }, hypre__tot, sum_buf); \ } +/* Plain parallel_for loop */ +#define hypre_LoopBegin(size, idx) \ +{ \ + BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < size) \ + { \ + +#define hypre_LoopEnd() \ + } \ + }); \ +} + /********************************************************************* * renamings From a127622baec600b5b2545188211f6afffa35f793 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 18 Oct 2021 18:17:42 -0700 Subject: [PATCH 17/44] Bug fix --- src/struct_mv/_hypre_struct_mv.hpp | 5 +++-- src/struct_mv/boxloop_sycl.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 8c111ed729..41c5eb4f51 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1547,10 +1547,11 @@ else \ /* Plain parallel_for loop */ #define hypre_LoopBegin(size, idx) \ { \ - BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + HYPRE_Int hypre__tot = size; \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ - if (idx < size) \ + if (idx < hypre__tot) \ { \ #define hypre_LoopEnd() \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index dd8f910562..a8812b70a5 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -395,10 +395,11 @@ else \ /* Plain parallel_for loop */ #define hypre_LoopBegin(size, idx) \ { \ - BoxLoopforall(size, [=] (sycl::nd_item<1> item) \ + HYPRE_Int hypre__tot = size; \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ - if 
(idx < size) \ + if (idx < hypre__tot) \ { \ #define hypre_LoopEnd() \ From 94a269d5a2699cad6bc8ddb6fe7316fa7f150216 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Mon, 25 Oct 2021 22:37:43 +0000 Subject: [PATCH 18/44] Fix configuration options for non-unified memory --- src/configure | 17 ++++++++++++++++- src/utilities/device_utils.c | 5 ++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/configure b/src/configure index bb48dbdf9b..c7f941f40e 100755 --- a/src/configure +++ b/src/configure @@ -4910,6 +4910,21 @@ then as_fn_error $? "--with-hip and --with-device-openmp are mutually exclusive" "$LINENO" 5 fi +if test "x$hypre_using_cuda" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? "--with-cuda and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + +if test "x$hypre_using_hip" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? "--with-hip and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + +if test "x$hypre_using_device_openmp" = "xyes" && test "x$hypre_using_sycl" = "xyes" +then + as_fn_error $? 
"--with-device-openmp and --with-sycl are mutually exclusive" "$LINENO" 5 +fi + if test "$hypre_user_chose_cudacompilers" = "no" then @@ -9315,7 +9330,7 @@ then $as_echo "#define HYPRE_USING_UNIFIED_MEMORY 1" >>confdefs.h else - if test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" + if test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" || test "x$hypre_using_sycl" = "xyes" then $as_echo "#define HYPRE_USING_DEVICE_MEMORY 1" >>confdefs.h diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 3ff5aab39b..e803495e9d 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,12 +9,11 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -// WM: TODO: verify sycl::range<1> hypre_GetDefaultCUDABlockDimension() { // 256 - max work group size for Gen9 - // 512 - max work group size for ATS - sycl::range<1> wgDim(64); + // 1024 - max work group size for ATS + sycl::range<1> wgDim(1024); return wgDim; } From 39fbd2db4b34157481f357a953e7bdfaccd7f77a Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 16:29:02 +0000 Subject: [PATCH 19/44] Update oneapi reduction --- src/struct_mv/_hypre_struct_mv.hpp | 2 +- src/struct_mv/boxloop_sycl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 41c5eb4f51..e48daf8bf2 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1253,7 +1253,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), 
sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); }).wait_and_throw(); } } diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index a8812b70a5..02c90e6331 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -101,7 +101,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ONEAPI::reduction(sum_acc, sycl::ONEAPI::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); }).wait_and_throw(); } } From 193ee25d224fb67ae5d94a812025e46ca0d61e2d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 17:12:34 +0000 Subject: [PATCH 20/44] Bug fix in parallel --- src/struct_mv/struct_communication.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/struct_mv/struct_communication.c b/src/struct_mv/struct_communication.c index 81be1bb7ba..cb321ee9f7 100644 --- a/src/struct_mv/struct_communication.c +++ b/src/struct_mv/struct_communication.c @@ -846,7 +846,7 @@ hypre_InitializeCommunication( hypre_CommPkg *comm_pkg, #if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) alloc_dev_buffer = 1; -#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) alloc_dev_buffer = (hypre_HandleStructExecPolicy(hypre_handle()) == HYPRE_EXEC_DEVICE); #elif defined(HYPRE_USING_DEVICE_OPENMP) alloc_dev_buffer = hypre__global_offload; From 9166c167718cff2006206ddf7a353282da9c9ea7 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 26 Oct 2021 18:19:07 +0000 Subject: [PATCH 21/44] Additional macro fixes and implementation of redblack relax --- src/struct_ls/red_black_gs.h | 59 
++++++++++++++++++++++++++++ src/struct_mv/_hypre_struct_mv.h | 2 +- src/struct_mv/box.h | 2 +- src/struct_mv/struct_communication.c | 2 +- 4 files changed, 62 insertions(+), 3 deletions(-) diff --git a/src/struct_ls/red_black_gs.h b/src/struct_ls/red_black_gs.h index afd8eabf7b..f1f95d864f 100644 --- a/src/struct_ls/red_black_gs.h +++ b/src/struct_ls/red_black_gs.h @@ -209,6 +209,65 @@ typedef struct }); \ } +#elif defined(HYPRE_USING_SYCL) + +#define hypre_RedBlackLoopInit() +#define hypre_RedBlackLoopBegin(ni,nj,nk,redblack, \ + Astart,Ani,Anj,Ai, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,Ai,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + Ai = Astart + kk*Anj*Ani + jj*Ani + ii; \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackLoopEnd() \ + } \ + }); \ +} + +#define hypre_RedBlackConstantcoefLoopBegin(ni,nj,nk,redblack, \ + bstart,bni,bnj,bi, \ + xstart,xni,xnj,xi) \ +{ \ + HYPRE_Int hypre__tot = nk*nj*((ni+1)/2); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + HYPRE_Int idx_local = idx; \ + HYPRE_Int ii,jj,kk,bi,xi; \ + HYPRE_Int local_ii; \ + kk = idx_local % nk; \ + idx_local = idx_local / nk; \ + jj = idx_local % nj; \ + idx_local = idx_local / nj; \ + local_ii = (kk + jj + redblack) % 2; \ + ii = 2*idx_local + local_ii; \ + if (ii < ni) \ + { \ + bi = bstart + kk*bnj*bni + jj*bni + ii; \ + xi = xstart + kk*xnj*xni + jj*xni + ii; \ + +#define hypre_RedBlackConstantcoefLoopEnd() \ + } \ + }); 
\ +} + #elif defined(HYPRE_USING_DEVICE_OPENMP) /* BEGIN OF OMP 4.5 */ diff --git a/src/struct_mv/_hypre_struct_mv.h b/src/struct_mv/_hypre_struct_mv.h index 70dbdf9f41..8567df0cf6 100644 --- a/src/struct_mv/_hypre_struct_mv.h +++ b/src/struct_mv/_hypre_struct_mv.h @@ -35,7 +35,7 @@ extern "C" { #define HYPRE_MAXDIM 3 #endif -#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #define hypre_BoxLoopSetOneBlock() #else #define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock diff --git a/src/struct_mv/box.h b/src/struct_mv/box.h index eae0061331..8d2ad2db9a 100644 --- a/src/struct_mv/box.h +++ b/src/struct_mv/box.h @@ -18,7 +18,7 @@ #define HYPRE_MAXDIM 3 #endif -#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) +#if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) || defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) #define hypre_BoxLoopSetOneBlock() #else #define hypre_BoxLoopSetOneBlock zypre_BoxLoopSetOneBlock diff --git a/src/struct_mv/struct_communication.c b/src/struct_mv/struct_communication.c index cb321ee9f7..d80e96620b 100644 --- a/src/struct_mv/struct_communication.c +++ b/src/struct_mv/struct_communication.c @@ -1218,7 +1218,7 @@ hypre_FinalizeCommunication( hypre_CommHandle *comm_handle ) #if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_RAJA) || defined(HYPRE_USING_KOKKOS) alloc_dev_buffer = 1; -#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) || defined(HYPRE_USING_SYCL) alloc_dev_buffer = 
(hypre_HandleStructExecPolicy(hypre_handle()) == HYPRE_EXEC_DEVICE); #elif defined(HYPRE_USING_DEVICE_OPENMP) alloc_dev_buffer = hypre__global_offload; From 4fca1be4c19d0c22ea85f47683959f53523a6d2b Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 27 Oct 2021 21:40:26 +0000 Subject: [PATCH 22/44] Automatic selection of block dimension --- src/utilities/_hypre_utilities.h | 1 + src/utilities/_hypre_utilities.hpp | 2 ++ src/utilities/device_utils.c | 11 +++++------ src/utilities/device_utils.h | 2 ++ src/utilities/handle.h | 1 + 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 7faf7f9a1d..7505d6277e 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1278,6 +1278,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 6fe8451f0f..4f062c7c06 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -290,6 +290,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_SYCL) sycl::device device; + HYPRE_Int 
device_max_work_group_size; #else HYPRE_Int device; #endif @@ -321,6 +322,7 @@ struct hypre_DeviceData #define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) #define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) #define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) #define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index e803495e9d..a845fe8303 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -11,10 +11,8 @@ #if defined(HYPRE_USING_SYCL) sycl::range<1> hypre_GetDefaultCUDABlockDimension() { - // 256 - max work group size for Gen9 - // 1024 - max work group size for ATS - sycl::range<1> wgDim(1024); - return wgDim; + sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); + return wgDim; } // WM: TODO: verify @@ -967,7 +965,7 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + << ex.what() << ", OpenCL code: " << ex.code() << std::endl; } } }; @@ -1232,7 +1230,8 @@ hypre_DeviceDataCreate() #if defined(HYPRE_USING_SYCL) /* WM: does the default selector get a GPU if available? 
Having trouble with getting the device on frank, so temporarily just passing the default selector */ - hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index a442f2229f..7123aefaaf 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -233,6 +233,7 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_SYCL) sycl::device device; + HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; #endif @@ -264,6 +265,7 @@ struct hypre_DeviceData #define hypre_DeviceDataCubDevAllocator(data) ((data) -> cub_dev_allocator) #define hypre_DeviceDataCubUvmAllocator(data) ((data) -> cub_uvm_allocator) #define hypre_DeviceDataDevice(data) ((data) -> device) +#define hypre_DeviceDataDeviceMaxWorkGroupSize(data) ((data) -> device_max_work_group_size) #define hypre_DeviceDataComputeStreamNum(data) ((data) -> compute_stream_num) #define hypre_DeviceDataReduceBuffer(data) ((data) -> reduce_buffer) #define hypre_DeviceDataStructCommRecvBuffer(data) ((data) -> struct_comm_recv_buffer) diff --git a/src/utilities/handle.h b/src/utilities/handle.h index 8e5979c7a2..2e3dc3198a 100644 --- a/src/utilities/handle.h +++ b/src/utilities/handle.h @@ -67,6 +67,7 @@ typedef struct #define hypre_HandleCubDevAllocator(hypre_handle) hypre_DeviceDataCubDevAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleCubUvmAllocator(hypre_handle) hypre_DeviceDataCubUvmAllocator(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleDevice(hypre_handle) hypre_DeviceDataDevice(hypre_HandleDeviceData(hypre_handle)) +#define hypre_HandleDeviceMaxWorkGroupSize(hypre_handle) hypre_DeviceDataDeviceMaxWorkGroupSize(hypre_HandleDeviceData(hypre_handle)) #define 
hypre_HandleComputeStreamNum(hypre_handle) hypre_DeviceDataComputeStreamNum(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleReduceBuffer(hypre_handle) hypre_DeviceDataReduceBuffer(hypre_HandleDeviceData(hypre_handle)) #define hypre_HandleStructCommRecvBuffer(hypre_handle) hypre_DeviceDataStructCommRecvBuffer(hypre_HandleDeviceData(hypre_handle)) From a6383e8cff5bb660eb0df5782e04de4a6c7babc6 Mon Sep 17 00:00:00 2001 From: Ruipeng Li Date: Wed, 27 Oct 2021 16:05:04 -0700 Subject: [PATCH 23/44] zboxloop --- src/test/zboxloop.c | 66 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/src/test/zboxloop.c b/src/test/zboxloop.c index f506d7f432..2e4830fe0f 100644 --- a/src/test/zboxloop.c +++ b/src/test/zboxloop.c @@ -20,8 +20,6 @@ * Test driver to time new boxloops and compare to the old ones *--------------------------------------------------------------------------*/ -#define DEVICE_VAR - hypre_int main( hypre_int argc, char *argv[] ) @@ -39,6 +37,7 @@ main( hypre_int argc, //HYPRE_Int xi1, xi2, xi3, xi4; HYPRE_Int xi1; HYPRE_Real *xp1, *xp2, *xp3, *xp4; + HYPRE_Real *d_xp1, *d_xp2, *d_xp3, *d_xp4; hypre_Index loop_size, start, unit_stride, index; /*----------------------------------------------------------- @@ -51,6 +50,8 @@ main( hypre_int argc, hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + HYPRE_Init(); + /*----------------------------------------------------------- * Set defaults *-----------------------------------------------------------*/ @@ -65,6 +66,8 @@ main( hypre_int argc, Q = 1; R = 1; + reps = -1; + /*----------------------------------------------------------- * Parse command line *-----------------------------------------------------------*/ @@ -92,6 +95,11 @@ main( hypre_int argc, arg_index++; dim = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-reps") == 0 ) + { + arg_index++; + reps = 
atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-help") == 0 ) { print_usage = 1; @@ -162,12 +170,20 @@ main( hypre_int argc, hypre_CopyBox(x1_data_box, x4_data_box); size = (nx+2)*(ny+2)*(nz+2); - xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); - xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_HOST); + + d_xp1 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp2 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp3 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); + d_xp4 = hypre_CTAlloc(HYPRE_Real, size, HYPRE_MEMORY_DEVICE); - reps = 1000000000/(nx*ny*nz+1000); + if (reps < 0) + { + reps = 1000000000/(nx*ny*nz+1000); + } /*----------------------------------------------------------- * Print driver parameters @@ -230,7 +246,7 @@ main( hypre_int argc, hypre_MPI_Barrier(hypre_MPI_COMM_WORLD); /*----------------------------------------------------------- - * Time old boxloops + * Time old boxloops [Device] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -239,12 +255,14 @@ main( hypre_int argc, for (rep = 0; rep < reps; rep++) { xi1 = 0; +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop0Begin(3, loop_size); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; //xi1++; } hypre_BoxLoop0End(); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -253,12 +271,14 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop1Begin(3, loop_size, x1_data_box, start, unit_stride, xi1); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; } 
hypre_BoxLoop1End(xi1); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -267,13 +287,15 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2) hypre_BoxLoop2Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); { - xp1[xi1] += xp1[xi1] + xp2[xi2]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2]; } hypre_BoxLoop2End(xi1, xi2); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -282,14 +304,16 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3) hypre_BoxLoop3Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3]; } hypre_BoxLoop3End(xi1, xi2, xi3); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -298,24 +322,26 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3,d_xp4) hypre_BoxLoop4Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3] + xp4[xi4]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3] + d_xp4[xi4]; } hypre_BoxLoop4End(xi1, xi2, xi3, xi4); +#undef DEVICE_VAR } hypre_EndTiming(time_index); - hypre_PrintTiming("Old BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("Old BoxLoop times [DEVICE]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); /*----------------------------------------------------------- - * Time new boxloops + * Time new boxloops [Host] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -415,7 +441,7 @@ main( hypre_int 
argc, } hypre_EndTiming(time_index); - hypre_PrintTiming("New BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("New BoxLoop times [HOST]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); @@ -427,11 +453,19 @@ main( hypre_int argc, hypre_BoxDestroy(x2_data_box); hypre_BoxDestroy(x3_data_box); hypre_BoxDestroy(x4_data_box); + hypre_TFree(xp1, HYPRE_MEMORY_HOST); hypre_TFree(xp2, HYPRE_MEMORY_HOST); hypre_TFree(xp3, HYPRE_MEMORY_HOST); hypre_TFree(xp4, HYPRE_MEMORY_HOST); + hypre_TFree(d_xp1, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp2, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp3, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp4, HYPRE_MEMORY_DEVICE); + + HYPRE_Finalize(); + /* Finalize MPI */ hypre_MPI_Finalize(); From 4ddcc4a27e62dc86f9c1484be020c045f833bc0d Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 17:23:41 +0000 Subject: [PATCH 24/44] Fixes for compiler update on jlse --- src/test/Makefile | 1 - src/utilities/device_utils.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/test/Makefile b/src/test/Makefile index 975e702290..8f5cedba35 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -65,7 +65,6 @@ LFLAGS =\ HYPRE_DRIVERS =\ ij.c\ - simple.c\ ij_assembly.c\ sstruct.c\ struct.c\ diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index a845fe8303..8dd1092508 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -965,7 +965,7 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.code() << std::endl; + << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; } } }; From 345b0d04a61d9f86faf32bb6e29b406ba48af9d2 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 20:29:13 +0000 Subject: [PATCH 25/44] Renamings --- src/IJ_mv/IJMatrix_parcsr_device.c | 8 +-- 
src/IJ_mv/IJVector_parcsr_device.c | 4 +- src/parcsr_ls/ads.c | 20 +++---- src/parcsr_ls/ame.c | 4 +- src/parcsr_ls/ams.c | 72 ++++++++++++------------- src/parcsr_ls/par_2s_interp_device.c | 14 ++--- src/parcsr_ls/par_coarsen_device.c | 8 +-- src/parcsr_ls/par_gauss_elim.c | 2 +- src/parcsr_ls/par_indepset_device.c | 6 +-- src/parcsr_ls/par_interp_device.c | 8 +-- src/parcsr_ls/par_interp_trunc_device.c | 4 +- src/parcsr_ls/par_lr_interp_device.c | 20 +++---- src/parcsr_ls/par_lr_restr_device.c | 4 +- src/parcsr_ls/par_relax_more_device.c | 4 +- src/parcsr_ls/par_strength_device.c | 4 +- src/parcsr_mv/par_csr_matop_device.c | 14 ++--- src/seq_mv/csr_matop_device.c | 28 +++++----- src/seq_mv/csr_spgemm_device_util.c | 6 +-- src/struct_mv/_hypre_struct_mv.hpp | 16 +++--- src/struct_mv/boxloop_cuda.h | 8 +-- src/struct_mv/boxloop_sycl.h | 8 +-- src/utilities/_hypre_utilities.hpp | 8 +-- src/utilities/device_utils.c | 45 ++++++++-------- src/utilities/device_utils.h | 8 +-- 24 files changed, 161 insertions(+), 162 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 157701dcee..1760f3f0db 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -153,8 +153,8 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, HYPRE_Int *indicator = hypre_CTAlloc(HYPRE_Int, len, HYPRE_MEMORY_DEVICE); hypreDevice_CsrRowPtrsToIndices_v2(nrows-1, len1, (HYPRE_Int *) row_indexes, indicator); /* mark unwanted elements as -1 */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(len1, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); auto new_end = HYPRE_THRUST_CALL( @@ -216,8 +216,8 @@ 
hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big HYPRE_Complex *A = hypre_TAlloc(HYPRE_Complex, N0, HYPRE_MEMORY_DEVICE); /* - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(N0, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); */ diff --git a/src/IJ_mv/IJVector_parcsr_device.c b/src/IJ_mv/IJVector_parcsr_device.c index a57bd9362c..b9afa8c67b 100644 --- a/src/IJ_mv/IJVector_parcsr_device.c +++ b/src/IJ_mv/IJVector_parcsr_device.c @@ -231,8 +231,8 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector) hypre_IJVectorAssembleSortAndReduce1(nelms, stack_i, stack_sora, stack_data, &new_nnz, &new_i, &new_sora, &new_data); /* set/add to local vector */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(new_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c index 63a2c0f32f..e8e87b9047 100644 --- a/src/parcsr_ls/ads.c +++ b/src/parcsr_ls/ads.c @@ -573,13 +573,13 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, Pi_diag_I, 3 * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J ); - gDim = 
hypre_GetDefaultCUDAGridDimension(F2V_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, @@ -635,13 +635,13 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, 3 * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, @@ -843,8 +843,8 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, F2V_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, @@ -923,8 +923,8 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, F2V_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(F2V_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c index fd34f4e189..eea0c6f9ae 100644 --- a/src/parcsr_ls/ame.c +++ b/src/parcsr_ls/ame.c @@ -465,8 +465,8 @@ HYPRE_Int hypre_AMESetup(void *esolver) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc ); } diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 7262ae0256..01fe07450d 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -190,8 +190,8 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } @@ -233,8 +233,8 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size_ * dim, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * 
dim, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } @@ -433,8 +433,8 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(A_offd); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); @@ -761,8 +761,8 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d); } @@ -1536,13 +1536,13 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, Pi_diag_I, dim * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, Pi_diag_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, 
bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -1601,13 +1601,13 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, Pi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -1835,8 +1835,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J, Piz_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -1901,8 +1901,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_diag_J, Piy_diag_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL, @@ -1959,8 +1959,8 @@ HYPRE_Int 
hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_diag_nnz, Pix_diag_J ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL, @@ -2036,8 +2036,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J, Piz_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -2118,8 +2118,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, thrust::make_zip_iterator(thrust::make_tuple(Pix_offd_J, Piy_offd_J)) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL, @@ -2190,8 +2190,8 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, G_offd_nnz, Pix_offd_J ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL, @@ -2382,13 +2382,13 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, GPi_diag_I, dim * _1 ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, GPi_diag_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_diag_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, @@ -2448,13 +2448,13 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim * _1 ); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nnz, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, GPi_offd_J ); - gDim = hypre_GetDefaultCUDAGridDimension(G_offd_nrows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, @@ -2679,8 +2679,8 @@ HYPRE_Int hypre_AMSSetup(void *solver, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nv, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", 
bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data ); } @@ -3244,8 +3244,8 @@ HYPRE_Int hypre_AMSSetup(void *solver, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) if (exec == HYPRE_EXEC_DEVICE) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(Gt_num_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data, Gx_data, Gy_data, Gz_data ); diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c index eab19cdd7e..15a497a04b 100644 --- a/src/parcsr_ls/par_2s_interp_device.c +++ b/src/parcsr_ls/par_2s_interp_device.c @@ -89,8 +89,8 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, /* weak row sum and diagonal, i.e., DF2F2 + Dgamma */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_local, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_local, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, @@ -142,7 +142,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, /* add to rsW those in AF2F that correspond to Dbeta == 0 * diagnoally scale As_F2F (from both sides) and replace the diagonal */ - gDim = hypre_GetDefaultCUDAGridDimension(AF2F_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, gDim, bDim, @@ -304,8 +304,8 @@ 
hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, hypre_assert(AFC_nr_local == hypre_ParCSRMatrixNumRows(As_FF)); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(AFC_nr_local, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(AFC_nr_local, "warp", bDim); /* Generate D_lambda in the paper: D_beta + (row sum of AFF without diagonal elements / row_nnz) */ /* Generate D_tmp, i.e., D_mu / D_lambda */ @@ -364,7 +364,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, /* weak row sum and diagonal, i.e., DFF + Dgamma */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_local, HYPRE_MEMORY_DEVICE); - gDim = hypre_GetDefaultCUDAGridDimension(A_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, @@ -415,7 +415,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, /* add to rsW those in AFF that correspond to lam == 0 * diagnoally scale As_F2F (from both sides) and replace the diagonal */ - gDim = hypre_GetDefaultCUDAGridDimension(AF2F_nr_local, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF, gDim, bDim, diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c index 38764b619f..6c30741003 100644 --- a/src/parcsr_ls/par_coarsen_device.c +++ b/src/parcsr_ls/par_coarsen_device.c @@ -317,8 +317,8 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */ HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(num_rows_diag, "thread", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = 
hypre_GetDefaultDeviceGridDimension(num_rows_diag, "thread", bDim); hypre_ParCSRCommHandle *comm_handle; HYPRE_Int *new_end; @@ -484,8 +484,8 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(graph_diag_size, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, gDim, bDim, diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c index fa6f58ac19..2a8c9f6189 100644 --- a/src/parcsr_ls/par_gauss_elim.c +++ b/src/parcsr_ls/par_gauss_elim.c @@ -418,7 +418,7 @@ hypreCUDAKernel_dgemv(HYPRE_Int m, HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real *a, HYPRE_Real *x, HYPRE_Real *y) { dim3 bDim(BLOCK_SIZE, 1, 1); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(m, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c index d031b6936b..bfebafebc1 100644 --- a/src/parcsr_ls/par_indepset_device.c +++ b/src/parcsr_ls/par_indepset_device.c @@ -167,8 +167,8 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, /*------------------------------------------------------- * Remove nodes from the initial independent set *-------------------------------------------------------*/ - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(graph_diag_size, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, graph_diag_size, graph_diag, 
measure_diag, measure_offd, @@ -184,7 +184,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, hypre_ParCSRCommHandleDestroy(comm_handle); /* adjust IS_marker_diag from the received */ - gDim = hypre_GetDefaultCUDAGridDimension(num_elmts_send, "thread", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, IS_marker_diag, num_elmts_send, send_map_elmts, diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c index 714f846674..a7e5476ffb 100644 --- a/src/parcsr_ls/par_interp_device.c +++ b/src/parcsr_ls/par_interp_device.c @@ -175,8 +175,8 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, P_diag_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location); P_offd_i = hypre_TAlloc(HYPRE_Int, n_fine+1, memory_location); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, @@ -1124,8 +1124,8 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A, P_diag_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE); P_offd_j_temp = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c index 30fc5147d2..4524f91f9e 100644 --- 
a/src/parcsr_ls/par_interp_trunc_device.c +++ b/src/parcsr_ls/par_interp_trunc_device.c @@ -156,8 +156,8 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f hypreDevice_CsrRowIndicesToPtrs_v2(nrows, nnz_P, P_i, P_rowptr); /* truncate P, unwanted entries are marked -1 in P_j */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a ); diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c index 84e1ba4e21..43ac592e95 100644 --- a/src/parcsr_ls/par_lr_interp_device.c +++ b/src/parcsr_ls/par_lr_interp_device.c @@ -66,8 +66,8 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_alpha) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -109,7 +109,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* 5. Form matrix ~{A_FF}, (return twAFF in AFF data structure ) */ /* 6. 
Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc, gDim, bDim, W_nr_of_rows, @@ -252,8 +252,8 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_alpha) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -329,7 +329,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, AFF_diag_data_old ); hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w, gDim, bDim, W_nr_of_rows, @@ -477,8 +477,8 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* row sum of A-weak + Diag(A), i.e., (D_gamma + D_FF) in the notes, only for F-pts */ rsWA = hypre_TAlloc(HYPRE_Complex, A_nr_of_rows, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(A_nr_of_rows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, @@ -522,7 +522,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dlam = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, 
HYPRE_MEMORY_DEVICE); dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE); hypre_GpuProfilingPushRange("Compute D_tmp"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, bDim, W_nr_of_rows, @@ -562,7 +562,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* 5. Form matrix ~{A_FF}, (return twAFF in AFF data structure ) */ /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); - gDim = hypre_GetDefaultCUDAGridDimension(W_nr_of_rows, "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, gDim, bDim, W_nr_of_rows, diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c index ff5e6450a3..104ec87451 100644 --- a/src/parcsr_ls/par_lr_restr_device.c +++ b/src/parcsr_ls/par_lr_restr_device.c @@ -245,8 +245,8 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A, thrust::plus() ); /* assemble the diagonal part of R from Z */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a); diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index 6cf1769b62..657905f3d9 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -151,8 +151,8 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = 
hypre_GetDefaultCUDAGridDimension(A_num_rows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, gDim, bDim, diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c index 196ebd0051..a2ca43fc8e 100644 --- a/src/parcsr_ls/par_strength_device.c +++ b/src/parcsr_ls/par_strength_device.c @@ -134,8 +134,8 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, } /* count the row nnz of S */ - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_variables, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_variables, "warp", bDim); if (abs_soc) { diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index c0d44f3d2e..251e28d3a6 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -617,8 +617,8 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A) hypre_CSRMatrixI(B) + hypre_CSRMatrixNumRows(B) + 1, hypre_CSRMatrixI(B) ); - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, @@ -732,8 +732,8 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_CSRMatrixI(B) + hypre_ParCSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) ); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, @@ -761,7 +761,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_CSRMatrixI(B) + hypre_ParCSRMatrixNumRows(A) + 1, thrust::plus() ); - gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(E), "warp", bDim); + gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(E), "warp", bDim); hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag)); @@ -1192,8 +1192,8 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A, elmt_tols_offd = hypre_TAlloc(HYPRE_Real, hypre_CSRMatrixNumNonzeros(A_offd), HYPRE_MEMORY_DEVICE); } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); if (type == -1) { diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index e4a5c98e74..9b428d0553 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -653,8 +653,8 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); @@ -689,8 +689,8 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) return 0; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = 
hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim, @@ -778,8 +778,8 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, return ierr; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); @@ -873,8 +873,8 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, return ierr; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); @@ -1072,8 +1072,8 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); if (type == 0) { @@ -1179,8 +1179,8 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Int *A_j = hypre_CSRMatrixJ(A); dim3 bDim, gDim; - bDim = hypre_GetDefaultCUDABlockDimension(); - gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); @@ -1449,8 +1449,8 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(nnzA + nnzB, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index 63270e43b4..9514be1f1a 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -97,19 +97,19 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of hypre_assert(type == 2 || num_ghash <= num_rows); HYPRE_Int *ghash_i, ghash_size; - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); if (type == 1) { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_ghash, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_rows, "thread", bDim); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index e48daf8bf2..39a897a9a9 100644 --- 
a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -794,8 +794,8 @@ BoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } @@ -845,8 +845,8 @@ ReductionBoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); /* Note: we assume gDim cannot exceed 1024 * and bDim < WARP * WARP @@ -1210,8 +1210,8 @@ BoxLoopforall( HYPRE_Int length, else if (exec_policy == HYPRE_EXEC_DEVICE) { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { @@ -1247,8 +1247,8 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index a5e54462c6..cd477fe2eb 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -70,8 +70,8 @@ BoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } @@ -121,8 +121,8 @@ ReductionBoxLoopforall( HYPRE_Int length, } else if (exec_policy == HYPRE_EXEC_DEVICE) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); /* Note: we assume gDim cannot exceed 1024 * and bDim < WARP * WARP diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 02c90e6331..1527f3f1b1 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -58,8 +58,8 @@ BoxLoopforall( HYPRE_Int length, else if (exec_policy == HYPRE_EXEC_DEVICE) { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { @@ -95,8 +95,8 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, { /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultCUDABlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultCUDAGridDimension(length, "thread", bDim); + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 4f062c7c06..d8276869d7 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -523,9 +523,9 @@ hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) } /* device_utils.c */ -sycl::range<1> hypre_GetDefaultCUDABlockDimension(); +sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); -sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); +sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); #endif // #if defined(HYPRE_USING_SYCL) @@ -1106,9 +1106,9 @@ struct print_functor }; /* device_utils.c */ -dim3 hypre_GetDefaultCUDABlockDimension(); +dim3 hypre_GetDefaultDeviceBlockDimension(); -dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim 
); +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 8dd1092508..72a30c73be 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,14 +9,13 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -sycl::range<1> hypre_GetDefaultCUDABlockDimension() +sycl::range<1> hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -// WM: TODO: verify -sycl::range<1> hypre_GetDefaultCUDAGridDimension(HYPRE_Int n, +sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, const char *granularity, sycl::range<1> wgDim) { @@ -110,7 +109,7 @@ void hypre_CudaCompileFlagCheck() } dim3 -hypre_GetDefaultCUDABlockDimension() +hypre_GetDefaultDeviceBlockDimension() { dim3 bDim(512, 1, 1); @@ -118,7 +117,7 @@ hypre_GetDefaultCUDABlockDimension() } dim3 -hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, +hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ) { @@ -182,8 +181,8 @@ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz) { - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "thread", bDim); + const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "thread", bDim); /* trivial case */ if (nrows <= 0) @@ -321,8 +320,8 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, hypre_assert(!(nrows > 1 && d_ib == NULL)); - const dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - const dim3 gDim = hypre_GetDefaultCUDAGridDimension(nrows, "warp", bDim); + const dim3 bDim = 
hypre_GetDefaultDeviceBlockDimension(); + const dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); /* if (job == 2) @@ -570,8 +569,8 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea hypre_assert(reduced_n == new_end.second - reduced_y); - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(reduced_n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); @@ -613,8 +612,8 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); @@ -645,8 +644,8 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); @@ -677,8 +676,8 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); @@ 
-714,8 +713,8 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); @@ -747,8 +746,8 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, return hypre_error_flag; } - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(n, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); @@ -771,8 +770,8 @@ hypreCUDAKernel_BigToSmallCopy( HYPRE_Int* __restrict__ tgt, HYPRE_Int hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int size) { - dim3 bDim = hypre_GetDefaultCUDABlockDimension(); - dim3 gDim = hypre_GetDefaultCUDAGridDimension(size, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 7123aefaaf..9127222d8b 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -466,9 +466,9 @@ hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) } /* device_utils.c */ -sycl::range<1> hypre_GetDefaultCUDABlockDimension(); +sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); -sycl::range<1> hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); +sycl::range<1> hypre_GetDefaultDeviceGridDimension( 
HYPRE_Int n, const char *granularity, sycl::range<1> bDim ); #endif // #if defined(HYPRE_USING_SYCL) @@ -1049,9 +1049,9 @@ struct print_functor }; /* device_utils.c */ -dim3 hypre_GetDefaultCUDABlockDimension(); +dim3 hypre_GetDefaultDeviceBlockDimension(); -dim3 hypre_GetDefaultCUDAGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim ); template HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *vals, HYPRE_Int opt); From f48eec0cdc6baefa7cb9ce60fc9fadaa54cf26cf Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Fri, 29 Oct 2021 21:25:56 +0000 Subject: [PATCH 26/44] Try different formulation of reduction Uses shared memory pointer instead of buffers and accessors. Seems to work on iris, same error as before on arcticus. --- src/struct_mv/_hypre_struct_mv.hpp | 19 +++++++++++-------- src/struct_mv/boxloop_sycl.h | 19 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 39a897a9a9..ee421fb85d 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1224,7 +1224,7 @@ template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - sycl::buffer sum_buf ) + HYPRE_Real *hypre_sycl_sum ) { if (length <= 0) { @@ -1252,8 +1252,7 @@ ReductionBoxLoopforall( LOOP_BODY loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); }).wait_and_throw(); } } @@ -1506,7 +1505,8 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, 
dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -1517,7 +1517,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -1529,7 +1530,8 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -1541,7 +1543,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -1556,7 +1559,7 @@ else \ #define hypre_LoopEnd() \ } \ - }); \ + }); \ } diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index 1527f3f1b1..af8f1d9f9d 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -72,7 +72,7 @@ template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - sycl::buffer sum_buf ) + HYPRE_Real *hypre_sycl_sum ) { if (length <= 0) { @@ -100,8 +100,7 @@ ReductionBoxLoopforall( LOOP_BODY 
loop_body, hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) { - sycl::accessor sum_acc(sum_buf, cgh, sycl::read_write); - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::ext::oneapi::reduction(sum_acc, std::plus<>()), loop_body); + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); }).wait_and_throw(); } } @@ -354,7 +353,8 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -365,7 +365,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -377,7 +378,8 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - sycl::buffer sum_buf(&sum_var, 1); \ + HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ @@ -389,7 +391,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, sum_buf); \ + }, hypre__tot, hypre_sycl_sum); \ + hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } 
/* Plain parallel_for loop */ @@ -404,7 +407,7 @@ else \ #define hypre_LoopEnd() \ } \ - }); \ + }); \ } From 980bee52687b189673e33224db214b066f117c21 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 2 Nov 2021 18:16:49 +0000 Subject: [PATCH 27/44] Autoconf clean up --- src/config/HYPRE_config.h.in | 15 +++- src/config/configure.in | 74 +++++------------- src/configure | 143 +++++++++++++++-------------------- src/test/Makefile | 8 +- 4 files changed, 96 insertions(+), 144 deletions(-) diff --git a/src/config/HYPRE_config.h.in b/src/config/HYPRE_config.h.in index fd9398adfa..e9c034d3fd 100644 --- a/src/config/HYPRE_config.h.in +++ b/src/config/HYPRE_config.h.in @@ -196,9 +196,6 @@ /* HIP being used */ #undef HYPRE_USING_HIP -/* SYCL being used */ -#undef HYPRE_USING_SYCL - /* Define to 1 if using host memory only */ #undef HYPRE_USING_HOST_MEMORY @@ -220,6 +217,15 @@ /* NVTX being used */ #undef HYPRE_USING_NVTX +/* onemkl::BLAS being used */ +#undef HYPRE_USING_ONEMKLBLAS + +/* onemkl::rng being used */ +#undef HYPRE_USING_ONEMKLRAND + +/* onemkl::SPARSE being used */ +#undef HYPRE_USING_ONEMKLSPARSE + /* Enable OpenMP support */ #undef HYPRE_USING_OPENMP @@ -241,6 +247,9 @@ /* Define to 1 if using AMD rocTX profiling */ #undef HYPRE_USING_ROCTX +/* SYCL being used */ +#undef HYPRE_USING_SYCL + /* Define to 1 if using UMPIRE */ #undef HYPRE_USING_UMPIRE diff --git a/src/config/configure.in b/src/config/configure.in index 4f282ead8a..3c1f0b16fe 100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -204,8 +204,6 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no -hypre_found_sycl=no - dnl ********************************************************************* dnl * Initialize flag-check variables @@ -1568,6 +1566,11 @@ then then AC_CHECK_PROGS(CUCC, hipcc) fi + + if test "$hypre_using_sycl" = "yes" + then + AC_CHECK_PROGS(CUCC, dpcpp) + fi fi dnl 
********************************************************************* @@ -1976,9 +1979,9 @@ fi if [test "x$hypre_using_um" = "xyes"] then - if [test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes"] + if [test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes"] then - AC_MSG_ERROR([Asked for unified memory, but not using CUDA, HIP, or device OpenMP!]) + AC_MSG_ERROR([Asked for unified memory, but not using CUDA, HIP, SYCL, or device OpenMP!]) fi fi dnl hypre_using_um @@ -2019,27 +2022,6 @@ AS_IF([ test x"$hypre_using_hip" == x"yes" ], [AC_MSG_ERROR([unable to find ${HYPRE_ROCM_PREFIX}/include/hip/hip_common.h ... Ensure ROCm is installed and set ROCM_PATH environment variable to ROCm installation path.])] )], []) -dnl ********************************************************************* -dnl * Check for SYCL header -dnl ********************************************************************* - -dnl If the user has requested to use SYCL, we first check the environment -dnl for ONEAPI_PATH to point at the oneAPI installation. If that is not found, -dnl then we default to `/opt/intel/oneapi`. -dnl -dnl TODO: Add an ARG_WITH for sycl so the user can control the oneAPI path -dnl through the configure line -AS_IF([ test x"$hypre_using_sycl" == x"yes" ], - [ AS_IF([ test -n "$ONEAPI_PATH"], - [ HYPRE_SYCL_PREFIX=$ONEAPI_PATH ], - [ HYPRE_SYCL_PREFIX=/opt/intel/oneapi ]) - - AC_SUBST(HYPRE_SYCL_PREFIX) - AC_CHECK_HEADERS( ["${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp"], - [hypre_found_sycl=yes], - [AC_MSG_ERROR([unable to find ${HYPRE_SYCL_PREFIX}/compiler/latest/linux/include/sycl/CL/sycl.hpp ... 
Ensure oneAPI SDK is installed and set ONEAPI_PATH environment variable to oneAPI installation path.])] )], - []) - dnl ********************************************************************* dnl * Set raja options dnl ********************************************************************* @@ -2317,43 +2299,22 @@ AS_IF([test x"$hypre_using_hip" == x"yes"], dnl ********************************************************************* dnl * Set SYCL options dnl ********************************************************************* -AS_IF([test x"$hypre_user_chose_sycl" == x"yes"], +AS_IF([test x"$hypre_using_sycl" == x"yes"], [ AC_DEFINE(HYPRE_USING_GPU, 1, [Define to 1 if executing on GPU device]) AC_DEFINE(HYPRE_USING_SYCL, 1, [SYCL being used]) - dnl The actual invocation of the clang compiler from oneAPI that - dnl supports SYCL and all the command line foo needed by the compiler. - AC_CHECK_PROGS(CXX, [dpcpp]) + LINK_CC=${CUCC} + LINK_CXX=${CUCC} - dnl (Ab)Using dpcpp when compiling SYCL - LINK_CC=${CXX} - LINK_CXX=${CXX} - - dnl The "-x sycl" is necessary to override the detection of .c files which clang - dnl interprets as C and therefore invokes the C compiler rather than the SYCL part - dnl of clang. Put SYCLCXXFLAGS at the end so the user can override from - dnl from the configure line. - SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " dnl If not in debug mode, at least -O2, but the user can override with - dnl with SYCLCXXFLAGS on the configure line. If in debug mode, -O0 -Wall + dnl with SYCLFLAGS on the configure line. 
If in debug mode, -O0 -Wall dnl plus flags for debugging symbols AS_IF([test x"$hypre_using_debug" == x"yes"], - [SYCLCXXFLAGS="-O0 -Wall -g -gdb ${SYCLCXXFLAGS}"], - [SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"],) - - dnl (Ab)Use CXXFLAGS to capture SYCL compilation flags - dnl Put SYCLCXXFLAGS at the end so the user can override the optimization level. - CXXFLAGS="${SYCLCXXFLAGS}" - - dnl dpl, dpct so we need both for Thrust on Intel GPUs. - dnl These are header-only so no linking needed. - HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" - - dnl SYCL library - HYPRE_SYCL_LIBS="-L${HYPRE_SYCL_PREFIX}/lib -lamdsycl64" + [SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}"], + [SYCLFLAGS="-O2 ${SYCLFLAGS}"],) AS_IF([test x"$hypre_using_onemklsparse" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLSPARSE, 1, [onemkl::SPARSE being used]) @@ -2367,14 +2328,13 @@ AS_IF([test x"$hypre_user_chose_sycl" == x"yes"], HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" ]) - dnl onemklrand: random number generation on Intel GPUs AS_IF([test x"$hypre_using_onemklrand" == x"yes"], [AC_DEFINE(HYPRE_USING_ONEMKLRAND, 1, [onemkl::rng being used]) HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/rng.hpp" ]) - ]) dnl AS_IF([test x"$hypre_user_chose_sycl" == x"yes"] + ]) dnl AS_IF([test x"$hypre_using_sycl" == x"yes"] dnl ********************************************************************* @@ -2399,7 +2359,7 @@ then AC_MSG_NOTICE([Use --enable-unified-memory to compile with unified memory.]) AC_MSG_NOTICE([***********************************************************************]) fi - if test "$hypre_user_chose_sycl" = "yes" + if test "$hypre_using_sycl" = "yes" then AC_MSG_NOTICE([***********************************************************]) AC_MSG_NOTICE([Configuring with --with-sycl=yes without unified memory.]) @@ -2494,7 
+2454,7 @@ if test "x$hypre_using_um" = "xyes" then AC_DEFINE([HYPRE_USING_UNIFIED_MEMORY],1,[Define to 1 if using unified memory]) else - if [test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes"] + if [test "x$hypre_using_cuda" = "xyes" || test "x$hypre_using_device_openmp" = "xyes" || test "x$hypre_using_hip" = "xyes" || test "x$hypre_using_sycl" = "xyes"] then AC_DEFINE([HYPRE_USING_DEVICE_MEMORY],1,[Define to 1 if using device memory without UM]) else diff --git a/src/configure b/src/configure index c7f941f40e..eef3052752 100755 --- a/src/configure +++ b/src/configure @@ -2809,8 +2809,6 @@ hypre_using_onemklsparse=no hypre_using_onemklblas=no hypre_using_onemklrand=no -hypre_found_sycl=no - hypre_blas_lib_old_style=no hypre_blas_lib_dir_old_style=no @@ -4014,13 +4012,12 @@ fi - # Check whether --with-sycl was given. if test "${with_sycl+set}" = set; then : withval=$with_sycl; case "$withval" in -yes) hypre_using_sycl=yes ;; -no) hypre_using_sycl=no ;; -*) hypre_using_sycl=no ;; + yes) hypre_using_sycl=yes ;; + no) hypre_using_sycl=no ;; + *) hypre_using_sycl=no ;; esac else hypre_using_sycl=no @@ -4910,21 +4907,6 @@ then as_fn_error $? "--with-hip and --with-device-openmp are mutually exclusive" "$LINENO" 5 fi -if test "x$hypre_using_cuda" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? "--with-cuda and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - -if test "x$hypre_using_hip" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? "--with-hip and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - -if test "x$hypre_using_device_openmp" = "xyes" && test "x$hypre_using_sycl" = "xyes" -then - as_fn_error $? 
"--with-device-openmp and --with-sycl are mutually exclusive" "$LINENO" 5 -fi - if test "$hypre_user_chose_cudacompilers" = "no" then @@ -5097,6 +5079,52 @@ done done IFS=$as_save_IFS +fi +fi +CUCC=$ac_cv_prog_CUCC +if test -n "$CUCC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 +$as_echo "$CUCC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CUCC" && break +done + + fi + + if test "$hypre_using_sycl" = "yes" + then + for ac_prog in dpcpp +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CUCC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CUCC"; then + ac_cv_prog_CUCC="$CUCC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CUCC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + fi fi CUCC=$ac_cv_prog_CUCC @@ -8626,7 +8654,7 @@ if test "x$hypre_using_um" = "xyes" then if test "x$hypre_using_cuda" != "xyes" && test "x$hypre_using_device_openmp" != "xyes" && test "x$hypre_using_hip" != "xyes" && test "x$hypre_using_sycl" != "xyes" then - as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, or device OpenMP!" "$LINENO" 5 + as_fn_error $? "Asked for unified memory, but not using CUDA, HIP, SYCL, or device OpenMP!" 
"$LINENO" 5 fi fi if test "$hypre_using_cuda" = "yes" || test "$hypre_using_device_openmp" = "yes" @@ -8831,8 +8859,6 @@ done fi - - if test "x$hypre_using_raja" = "xyes" then @@ -9093,74 +9119,26 @@ fi fi - if test x"$hypre_using_sycl" == x"yes"; then : -$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h - -$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h - - for ac_prog in dpcpp -do - # Extract the first word of "$ac_prog", so it can be a program name with args. -set dummy $ac_prog; ac_word=$2 -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 -$as_echo_n "checking for $ac_word... " >&6; } -if ${ac_cv_prog_CUCC+:} false; then : - $as_echo_n "(cached) " >&6 -else - if test -n "$CUCC"; then - ac_cv_prog_CUCC="$CUCC" # Let the user override the test. -else -as_save_IFS=$IFS; IFS=$PATH_SEPARATOR -for as_dir in $PATH -do - IFS=$as_save_IFS - test -z "$as_dir" && as_dir=. - for ac_exec_ext in '' $ac_executable_extensions; do - if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then - ac_cv_prog_CUCC="$ac_prog" - $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 - break 2 - fi -done - done -IFS=$as_save_IFS - -fi -fi -CUCC=$ac_cv_prog_CUCC -if test -n "$CUCC"; then - { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CUCC" >&5 -$as_echo "$CUCC" >&6; } -else - { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 -$as_echo "no" >&6; } -fi +$as_echo "#define HYPRE_USING_GPU 1" >>confdefs.h - test -n "$CUCC" && break -done +$as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h - LINK_CC=${CUCC} + LINK_CC=${CUCC} LINK_CXX=${CUCC} - SYCLCXXFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " + CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel " if test x"$hypre_using_debug" == x"yes"; then : - SYCLCXXFLAGS="-O0 -Wall -g ${SYCLCXXFLAGS}" -elif SYCLCXXFLAGS="-O2 ${SYCLCXXFLAGS}"; then : + SYCLFLAGS="-O0 -Wall -g ${SYCLFLAGS}" +elif SYCLFLAGS="-O2 ${SYCLFLAGS}"; then : fi - 
CUFLAGS="${SYCLCXXFLAGS}" - - HYPRE_SYCL_INCL="-I${ONEAPI_PATH}/dpl/latest/linux/include" - HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${ONEAPI_PATH}/dpcpp-ct/latest/include" - - if test x"$hypre_using_onemklsparse" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLSPARSE 1" >>confdefs.h @@ -9174,12 +9152,12 @@ fi $as_echo "#define HYPRE_USING_ONEMKLBLAS 1" >>confdefs.h - HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" + HYPRE_SYCL_LIBS="${HYPRE_SYCL_LIBS} -lmkl_sycl" HYPRE_SYCL_INCL="${HYPRE_SYCL_INCL} -I${MKLROOT}/include/oneapi/mkl/blas.hpp" fi - if test x"$hypre_using_onemklrand" == x"yes"; then : + if test x"$hypre_using_onemklrand" == x"yes"; then : $as_echo "#define HYPRE_USING_ONEMKLRAND 1" >>confdefs.h @@ -9191,8 +9169,6 @@ fi fi - - if test "$hypre_using_um" != "yes" then if test "$hypre_using_cuda" = "yes" @@ -9562,6 +9538,9 @@ $as_echo "#define HYPRE_LINUX 1" >>confdefs.h + + + diff --git a/src/test/Makefile b/src/test/Makefile index 8f5cedba35..42fcddfd93 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -40,8 +40,12 @@ F77_COMPILE_FLAGS = \ MPILIBFLAGS = ${MPILIBDIRS} ${MPILIBS} ${MPIFLAGS} LAPACKLIBFLAGS = ${LAPACKLIBDIRS} ${LAPACKLIBS} BLASLIBFLAGS = ${BLASLIBDIRS} ${BLASLIBS} -# WM: had to add the absolute path to libHYPRE.a for successful compilation on frank -LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a +# WM: currently have to add the absolute path to libHYPRE.a when building sycl code +ifeq ($(notdir $(firstword ${LINK_CC})), dpcpp) + LIBFLAGS = ${LDFLAGS} ${LIBS} ${HYPRE_BUILD_DIR}/lib/libHYPRE.a +else + LIBFLAGS = ${LDFLAGS} ${LIBS} +endif ifeq ($(notdir $(firstword ${LINK_CC})), nvcc) XLINK = -Xlinker=-rpath,${HYPRE_BUILD_DIR}/lib From 2d5ee90db38b2b5efea2a809bb636f3ba760d4f4 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Tue, 2 Nov 2021 20:10:30 +0000 Subject: [PATCH 28/44] Cleanup boxloops, renamings, make sure tests compile --- src/seq_mv/csr_matop_device.c | 2 + src/seq_mv/csr_sptrans_device.c | 24 +++++++ 
src/struct_ls/pfmg_setup.c | 56 ++++++++-------- src/struct_mv/_hypre_struct_mv.hpp | 102 ++++++++++++----------------- src/struct_mv/boxloop_sycl.h | 102 ++++++++++++----------------- src/struct_mv/struct_innerprod.c | 2 +- 6 files changed, 141 insertions(+), 147 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 9b428d0553..2734b7aa9b 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -1496,6 +1496,8 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #elif defined(HYPRE_USING_ROCSPARSE) hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); +#elif defined(HYPRE_USING_ONEMKLSPARSE) + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #else hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index d41d38af04..440cce76ec 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -146,6 +146,18 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR #endif // #if defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_ONEMKLSPARSE) +HYPRE_Int +hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, + HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, + HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, + HYPRE_Int want_data) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_ONEMKLSPARSE) + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -222,3 +234,15 @@ hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int } #endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ + +#if defined(HYPRE_USING_SYCL) +HYPRE_Int 
+hypreDevice_CSRSpTrans(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, + HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, + HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, + HYPRE_Int want_data) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_SYCL) diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index 684824f26a..08129ac913 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -1061,7 +1061,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1075,7 +1075,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1089,7 +1089,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx * tcx; + hypre_sycl_sum += tcx * tcx; #else sqcxb += tcx * tcx; #endif @@ -1103,7 +1103,7 @@ hypre_PFMGComputeDxyz_SS5( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cn[Ai] + a_cs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy * tcy; + hypre_sycl_sum += tcy * tcy; #else sqcyb += tcy * tcy; #endif @@ -1266,7 +1266,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1280,7 +1280,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1294,7 +1294,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1308,7 +1308,7 @@ hypre_PFMGComputeDxyz_SS9( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1457,7 +1457,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1471,7 +1471,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1485,7 +1485,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -1499,7 +1499,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1513,7 +1513,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1527,7 +1527,7 @@ hypre_PFMGComputeDxyz_SS7( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -1736,7 +1736,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -1750,7 +1750,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -1764,7 +1764,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -1778,7 +1778,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcx = -diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -1792,7 +1792,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? 
-1.0 : 1.0; HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -1806,7 +1806,7 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real diag = a_cc[Ai] < 0.0 ? -1.0 : 1.0; HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -2058,7 +2058,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx; + hypre_sycl_sum += tcx; #else cxb += tcx; #endif @@ -2074,7 +2074,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy; + hypre_sycl_sum += tcy; #else cyb += tcy; #endif @@ -2090,7 +2090,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz; + hypre_sycl_sum += tcz; #else czb += tcz; #endif @@ -2106,7 +2106,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_cw[Ai] + a_ce[Ai] + a_aw[Ai] + a_ae[Ai] + a_bw[Ai] + a_be[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); 
tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcx*tcx; + hypre_sycl_sum += tcx*tcx; #else sqcxb += tcx*tcx; #endif @@ -2122,7 +2122,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcy*tcy; + hypre_sycl_sum += tcy*tcy; #else sqcyb += tcy*tcy; #endif @@ -2138,7 +2138,7 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); #if defined(HYPRE_USING_SYCL) - sum += tcz*tcz; + hypre_sycl_sum += tcz*tcz; #else sqczb += tcz*tcz; #endif @@ -2310,7 +2310,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) if (Ap[Ai] == 0.0) { #if defined(HYPRE_USING_SYCL) - sum += one; + hypre_sycl_sum += one; #else diag_product_local += one; #endif @@ -2318,7 +2318,7 @@ hypre_ZeroDiagonal( hypre_StructMatrix *A ) else { #if defined(HYPRE_USING_SYCL) - sum += zero; + hypre_sycl_sum += zero; #else diag_product_local += zero; #endif diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index ee421fb85d..d76cab3557 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1192,69 +1192,36 @@ void BoxLoopforall( HYPRE_Int length, LOOP_BODY loop_body) { - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? 
*/ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) + if (length <= 0) { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); + return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); } template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - HYPRE_Real *hypre_sycl_sum ) + HYPRE_Real *shared_sum_var ) { if (length <= 0) { return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { - /* WM: todo - is this really necessary, even? */ - /* for (HYPRE_Int idx = 0; idx < length; idx++) */ - /* { */ - /* loop_body(idx, reducer); */ - /* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? 
*/ - /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); - }).wait_and_throw(); - } + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(shared_sum_var, std::plus<>()), loop_body); + }).wait_and_throw(); } #ifdef __cplusplus @@ -1377,6 +1344,21 @@ else \ * Boxloops *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + +#define hypre_newBoxLoop0End() \ + } \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -1505,9 +1487,9 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ 
if (idx < hypre__tot) \ @@ -1517,8 +1499,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -1530,9 +1512,9 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -1543,8 +1525,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -1569,6 +1551,8 @@ else \ #define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/boxloop_sycl.h 
b/src/struct_mv/boxloop_sycl.h index af8f1d9f9d..b8a61a07ea 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -40,69 +40,36 @@ void BoxLoopforall( HYPRE_Int length, LOOP_BODY loop_body) { - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { -/* WM: todo - is this really necessary, even? */ -/* #ifdef HYPRE_USING_OPENMP */ -/* #pragma omp parallel for HYPRE_SMP_SCHEDULE */ -/* #endif */ -/* for (HYPRE_Int idx = 0; idx < length; idx++) */ -/* { */ -/* loop_body(idx); */ -/* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) + if (length <= 0) { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); - }).wait_and_throw(); + return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); + + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), loop_body); + }).wait_and_throw(); } template void ReductionBoxLoopforall( LOOP_BODY loop_body, HYPRE_Int length, - HYPRE_Real *hypre_sycl_sum ) + HYPRE_Real *shared_sum_var ) { if (length <= 0) { return; } + const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - /* HYPRE_ExecutionPolicy exec_policy = hypre_HandleStructExecPolicy(hypre_handle()); */ - /* WM: TODO: 
uncomment above and remove below */ - HYPRE_ExecutionPolicy exec_policy = HYPRE_EXEC_DEVICE; - - if (exec_policy == HYPRE_EXEC_HOST) - { - /* WM: todo - is this really necessary, even? */ - /* for (HYPRE_Int idx = 0; idx < length; idx++) */ - /* { */ - /* loop_body(idx, reducer); */ - /* } */ - } - else if (exec_policy == HYPRE_EXEC_DEVICE) - { - /* WM: question - is it better in sycl to launch parallel_for with blocks in this way as we do for cuda? */ - /* NOTE: in the cuda version, there is further manipulation of bDim and gDim that I don't include here */ - const sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - const sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - - hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) - { - cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(hypre_sycl_sum, std::plus<>()), loop_body); - }).wait_and_throw(); - } + hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) + { + cgh.parallel_for(sycl::nd_range<1>(gDim*bDim, bDim), sycl::reduction(shared_sum_var, std::plus<>()), loop_body); + }).wait_and_throw(); } #ifdef __cplusplus @@ -225,6 +192,21 @@ else \ * Boxloops *********************************************************************/ +/* BoxLoop 0 */ +#define hypre_newBoxLoop0Begin(ndim, loop_size) \ +{ \ + hypre_newBoxLoopInit(ndim, loop_size); \ + BoxLoopforall(hypre__tot, [=] (sycl::nd_item<1> item) \ + { \ + HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ + if (idx < hypre__tot) \ + { \ + +#define hypre_newBoxLoop0End() \ + } \ + }); \ +} + /* BoxLoop 1 */ #define hypre_newBoxLoop1Begin(ndim, loop_size, dbox1, start1, stride1, i1) \ { \ @@ -353,9 +335,9 @@ else \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, 
HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -365,8 +347,8 @@ else \ #define hypre_newBoxLoop1ReductionEnd(i1, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Reduction BoxLoop2 */ @@ -378,9 +360,9 @@ else \ hypre_newBoxLoopInit(ndim, loop_size); \ hypre_BoxLoopDataDeclareK(1, ndim, loop_size, dbox1, start1, stride1); \ hypre_BoxLoopDataDeclareK(2, ndim, loop_size, dbox2, start2, stride2); \ - HYPRE_Real *hypre_sycl_sum = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ - hypre_TMemcpy(hypre_sycl_sum, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ - ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &sum) \ + HYPRE_Real *shared_sum_var = hypre_CTAlloc(HYPRE_Real, 1, HYPRE_MEMORY_DEVICE); \ + hypre_TMemcpy(shared_sum_var, &sum_var, HYPRE_Real, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); \ + ReductionBoxLoopforall( [=] (sycl::nd_item<1> item, auto &hypre_sycl_sum) \ { \ HYPRE_Int idx = (HYPRE_Int) item.get_global_linear_id(); \ if (idx < hypre__tot) \ @@ -391,8 +373,8 @@ else \ #define hypre_newBoxLoop2ReductionEnd(i1, i2, sum_var) \ } \ - }, hypre__tot, hypre_sycl_sum); \ - hypre_TMemcpy(&sum_var, hypre_sycl_sum, HYPRE_Real, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ + }, hypre__tot, shared_sum_var); \ + hypre_TMemcpy(&sum_var, shared_sum_var, HYPRE_Real, 1, 
HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); \ } /* Plain parallel_for loop */ @@ -417,6 +399,8 @@ else \ #define hypre_BoxLoopBlock() 0 +#define hypre_BoxLoop0Begin hypre_newBoxLoop0Begin +#define hypre_BoxLoop0End hypre_newBoxLoop0End #define hypre_BoxLoop1Begin hypre_newBoxLoop1Begin #define hypre_BoxLoop1End hypre_newBoxLoop1End #define hypre_BoxLoop2Begin hypre_newBoxLoop2Begin diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index cfef661cb0..7d1c7e15ba 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -90,7 +90,7 @@ hypre_StructInnerProd( hypre_StructVector *x, { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); #if defined(HYPRE_USING_SYCL) - sum += tmp; + hypre_sycl_sum += tmp; #else box_sum += tmp; #endif From 4d303d3d5f3f6bd4ce3a9d778158a1cb525fdd8e Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Wed, 3 Nov 2021 22:14:40 +0000 Subject: [PATCH 29/44] Some placeholders and changes to allow ij interface to run on the host --- src/parcsr_ls/par_lr_interp.c | 30 ++++++++++++++---------------- src/parcsr_ls/par_mod_lr_interp.c | 17 ++++++++--------- src/seq_mv/csr_matvec.c | 4 ++++ src/seq_mv/csr_matvec_device.c | 17 +++++++++++++++++ src/seq_mv/protos.h | 3 +++ src/seq_mv/seq_mv.h | 3 +++ 6 files changed, 49 insertions(+), 25 deletions(-) diff --git a/src/parcsr_ls/par_lr_interp.c b/src/parcsr_ls/par_lr_interp.c index 9dce84705c..da45ec1a4b 100644 --- a/src/parcsr_ls/par_lr_interp.c +++ b/src/parcsr_ls/par_lr_interp.c @@ -5283,22 +5283,21 @@ hypre_BoomerAMGBuildExtInterp(hypre_ParCSRMatrix *A, HYPRE_Int *CF_marker, hypre_GpuProfilingPushRange("ExtInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildExtInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func, - debug_flag,trunc_factor,max_elmts,P_ptr); - } #if defined(HYPRE_USING_CUDA) || 
defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtInterpDevice(A,CF_marker,S,num_cpts_global,num_functions,dof_func, debug_flag,trunc_factor,max_elmts,P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildExtInterpHost(A,CF_marker,S,num_cpts_global,num_functions,dof_func, + debug_flag,trunc_factor,max_elmts,P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -5325,22 +5324,21 @@ hypre_BoomerAMGBuildExtPIInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ExtPIInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildExtPIInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, - debug_flag, trunc_factor, max_elmts, P_ptr); - } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtPIInterpDevice(A, CF_marker, S, num_cpts_global, num_functions, dof_func, debug_flag, trunc_factor, max_elmts, P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildExtPIInterpHost(A, CF_marker, S, num_cpts_global, num_functions, dof_func, + debug_flag, trunc_factor, max_elmts, P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/parcsr_ls/par_mod_lr_interp.c b/src/parcsr_ls/par_mod_lr_interp.c index bc0ecbe031..7cd946acd8 100644 --- a/src/parcsr_ls/par_mod_lr_interp.c +++ b/src/parcsr_ls/par_mod_lr_interp.c @@ -1170,23 +1170,22 @@ hypre_BoomerAMGBuildModExtPIInterp(hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("ModExtPIInterp"); #endif - HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( 
hypre_ParCSRMatrixMemoryLocation(A) ); - HYPRE_Int ierr = 0; - if (exec == HYPRE_EXEC_HOST) - { - ierr = hypre_BoomerAMGBuildModExtPIInterpHost(A, CF_marker, S, num_cpts_global, - debug_flag, num_functions, dof_func, - trunc_factor, max_elmts, P_ptr); - } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - else + HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_ParCSRMatrixMemoryLocation(A) ); + if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_BoomerAMGBuildExtPIInterpDevice(A, CF_marker, S, num_cpts_global, 1, NULL, debug_flag, trunc_factor, max_elmts, P_ptr); } + else #endif + { + ierr = hypre_BoomerAMGBuildModExtPIInterpHost(A, CF_marker, S, num_cpts_global, + debug_flag, num_functions, dof_func, + trunc_factor, max_elmts, P_ptr); + } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); diff --git a/src/seq_mv/csr_matvec.c b/src/seq_mv/csr_matvec.c index 90f57d44da..b86d1431bd 100644 --- a/src/seq_mv/csr_matvec.c +++ b/src/seq_mv/csr_matvec.c @@ -712,6 +712,10 @@ hypre_CSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_ExecutionPolicy exec = hypre_GetExecPolicy1( hypre_CSRMatrixMemoryLocation(A) ); +/* WM: TODO - remove after sycl implementation in place */ +#if defined(HYPRE_USING_SYCL) + exec = HYPRE_EXEC_HOST; +#endif if (exec == HYPRE_EXEC_DEVICE) { ierr = hypre_CSRMatrixMatvecDevice(0, alpha, A, x, beta, b, y, offset); diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 5ead8cb9b8..a6a09363e6 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -50,6 +50,8 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, hypre_CSRMatrixMatvecOMPOffload(trans, alpha, A, x, beta, y, offset); #elif defined(HYPRE_USING_ROCSPARSE) hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); +#elif defined(HYPRE_USING_ONEMKLSPARSE) + hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset); #else // #ifdef 
HYPRE_USING_CUSPARSE // WM: TODO: commenting this out for now, but put it back after sycl implementation is done /* #error HYPRE SPMV TODO */ @@ -314,5 +316,20 @@ hypre_CSRMatrixMatvecRocsparse( HYPRE_Int trans, } #endif // #if defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_ONEMKLSPARSE) +HYPRE_Int +hypre_CSRMatrixMatvecOnemklsparse( HYPRE_Int trans, + HYPRE_Complex alpha, + hypre_CSRMatrix *A, + hypre_Vector *x, + HYPRE_Complex beta, + hypre_Vector *y, + HYPRE_Int offset ) +{ +/* WM: TODO */ + return hypre_error_flag; +} +#endif // #if defined(HYPRE_USING_ONEMKLSPARSE) + + #endif // #if defined(HYPRE_USING_GPU) diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index f52e3836ed..898efc6154 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -89,6 +89,7 @@ HYPRE_Int hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, HYPRE_Complex al HYPRE_Int hypre_CSRMatrixMatvecCusparseOldAPI( HYPRE_Int trans, HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecOMPOffload (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecRocsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); +HYPRE_Int hypre_CSRMatrixMatvecOnemklsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); /* genpart.c */ HYPRE_Int hypre_GeneratePartitioning ( HYPRE_BigInt length , HYPRE_Int num_procs , HYPRE_BigInt **part_ptr ); @@ -196,6 +197,8 @@ HYPRE_Int hypreDevice_CSRSpTransCusparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnz HYPRE_Int hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out,
HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_ib, HYPRE_Int *d_jb, HYPRE_Int *d_rc); diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index 2964c08a03..257b9a7b38 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -361,6 +361,7 @@ HYPRE_Int hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, HYPRE_Complex al HYPRE_Int hypre_CSRMatrixMatvecCusparseOldAPI( HYPRE_Int trans, HYPRE_Complex alpha, hypre_CSRMatrix *A, hypre_Vector *x, HYPRE_Complex beta, hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecOMPOffload (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); HYPRE_Int hypre_CSRMatrixMatvecRocsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); +HYPRE_Int hypre_CSRMatrixMatvecOnemklsparse (HYPRE_Int trans, HYPRE_Complex alpha , hypre_CSRMatrix *A , hypre_Vector *x , HYPRE_Complex beta , hypre_Vector *y, HYPRE_Int offset ); /* genpart.c */ HYPRE_Int hypre_GeneratePartitioning ( HYPRE_BigInt length , HYPRE_Int num_procs , HYPRE_BigInt **part_ptr ); @@ -468,6 +469,8 @@ HYPRE_Int hypreDevice_CSRSpTransCusparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnz HYPRE_Int hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int 
hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Int *d_ib, HYPRE_Int *d_jb, HYPRE_Int *d_rc); From 99c5d9d72c1290375b21963e34699e98ba26b573 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 4 Nov 2021 21:20:58 +0000 Subject: [PATCH 30/44] Add cmake compilation --- src/CMakeLists.txt | 58 ++++++++++++++++++++- src/IJ_mv/CMakeLists.txt | 8 +-- src/config/HYPREConfig.cmake.in | 1 + src/config/HYPRE_config.h.cmake.in | 3 ++ src/config/cmake/HYPRE_CMakeUtilities.cmake | 6 +++ src/parcsr_ls/CMakeLists.txt | 8 +-- src/parcsr_mv/CMakeLists.txt | 8 +-- src/seq_mv/CMakeLists.txt | 8 +-- src/sstruct_ls/CMakeLists.txt | 8 +-- src/sstruct_mv/CMakeLists.txt | 8 +-- src/struct_ls/CMakeLists.txt | 8 +-- src/struct_mv/CMakeLists.txt | 8 +-- src/utilities/CMakeLists.txt | 8 +-- 13 files changed, 102 insertions(+), 38 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7887360b9e..94a92e72b1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,9 +101,11 @@ option(HYPRE_BUILD_TESTS "Build tests" OFF) option(HYPRE_USING_HOST_MEMORY "Use host memory" ON) set(HYPRE_WITH_EXTRA_CFLAGS "" CACHE STRING "Define extra C compile flags") set(HYPRE_WITH_EXTRA_CXXFLAGS "" CACHE STRING "Define extra CXX compile flags") -# CUDA options +# GPU options option(HYPRE_WITH_CUDA "Use CUDA. 
Require cuda-8.0 or higher" OFF) +option(HYPRE_WITH_SYCL "Use SYCL" OFF) option(HYPRE_ENABLE_UNIFIED_MEMORY "Use unified memory for allocating the memory" OFF) +# CUDA options option(HYPRE_ENABLE_CUDA_STREAMS "Use CUDA streams" ON) option(HYPRE_ENABLE_CUSPARSE "Use cuSPARSE" ON) option(HYPRE_ENABLE_DEVICE_POOL "Use device memory pool" OFF) @@ -280,6 +282,54 @@ if (HYPRE_WITH_CUDA) endif (CMAKE_CUDA_COMPILER) endif (HYPRE_WITH_CUDA) +# SYCL +if (HYPRE_WITH_SYCL) + enable_language(CXX) + message(STATUS "Enabled support for CXX.") + + # Enforce C++17 + if (NOT CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 17) + set(CMAKE_CXX_STANDARD 17) + endif () + set(CMAKE_CXX_STANDARD_REQUIRED ON) + + message(STATUS "Using CXX standard: c++${CMAKE_CXX_STANDARD}") + + # Set CXX compiler to dpcpp + set(CMAKE_CXX_COMPILER "dpcpp") + # WM: try with/without the line below + # set(CMAKE_LINKER "dpcpp") + + # Add any extra CXX compiler flags HYPRE_WITH_EXTRA_CXXFLAGS + if (NOT HYPRE_WITH_EXTRA_CXXFLAGS STREQUAL "") + string(REPLACE " " ";" HYPRE_WITH_EXTRA_CXXFLAGS "${HYPRE_WITH_EXTRA_CXXFLAGS}") + add_compile_options("$<$:${HYPRE_WITH_EXTRA_CXXFLAGS}>") + endif () + + set(HYPRE_USING_SYCL ON CACHE BOOL "" FORCE) + set(HYPRE_USING_GPU ON CACHE BOOL "" FORCE) + + if (HYPRE_ENABLE_UNIFIED_MEMORY) + set(HYPRE_USING_UNIFIED_MEMORY ON CACHE BOOL "" FORCE) + else () + set(HYPRE_USING_DEVICE_MEMORY ON CACHE BOOL "" FORCE) + endif () + + # Check if examples are enabled, but not unified memory + if (HYPRE_BUILD_EXAMPLES AND NOT HYPRE_ENABLE_UNIFIED_MEMORY) + message(WARNING "Running the examples on GPUs requires Unified Memory! 
+ Examples will not be built!") + set(HYPRE_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) + endif () + + add_compile_options("$<$:-fsycl>") + add_compile_options("$<$:-fsycl-unnamed-lambda>") + add_compile_options("$<$:-fsycl-device-code-split=per_kernel>") + + set(HYPRE_USING_HOST_MEMORY OFF CACHE BOOL "" FORCE) + +endif (HYPRE_WITH_SYCL) + # Add any extra C compiler flags HYPRE_WITH_EXTRA_CFLAGS if (NOT HYPRE_WITH_EXTRA_CFLAGS STREQUAL "") string(REPLACE " " ";" HYPRE_WITH_EXTRA_CFLAGS "${HYPRE_WITH_EXTRA_CFLAGS}") @@ -397,7 +447,11 @@ target_include_directories(${PROJECT_NAME} PUBLIC ) if (HYPRE_USING_CUDA) - set_source_files_properties(${HYPRE_CUDA_SOURCES} PROPERTIES LANGUAGE CUDA) + set_source_files_properties(${HYPRE_GPU_SOURCES} PROPERTIES LANGUAGE CUDA) +endif () + +if (HYPRE_USING_SYCL) + set_source_files_properties(${HYPRE_GPU_SOURCES} PROPERTIES LANGUAGE CXX) endif () # Set MPI compile flags diff --git a/src/IJ_mv/CMakeLists.txt b/src/IJ_mv/CMakeLists.txt index 5a7c4d5ec1..ba491c70cf 100644 --- a/src/IJ_mv/CMakeLists.txt +++ b/src/IJ_mv/CMakeLists.txt @@ -34,13 +34,13 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS IJMatrix_parcsr_device.c IJVector_parcsr_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/config/HYPREConfig.cmake.in b/src/config/HYPREConfig.cmake.in index d9d19cc267..1445fec262 100644 --- a/src/config/HYPREConfig.cmake.in +++ b/src/config/HYPREConfig.cmake.in @@ -28,6 +28,7 @@ set(HYPRE_BUILD_EXAMPLES @HYPRE_BUILD_EXAMPLES@) set(HYPRE_BUILD_TESTS @HYPRE_BUILD_TESTS@) set(HYPRE_USING_HOST_MEMORY @HYPRE_USING_HOST_MEMORY@) set(HYPRE_WITH_CUDA @HYPRE_WITH_CUDA@) +set(HYPRE_WITH_SYCL @HYPRE_WITH_SYCL@) 
set(HYPRE_ENABLE_UNIFIED_MEMORY @HYPRE_ENABLE_UNIFIED_MEMORY@) set(HYPRE_ENABLE_CUDA_STREAMS @HYPRE_ENABLE_CUDA_STREAMS@) set(HYPRE_ENABLE_CUSPARSE @HYPRE_ENABLE_CUSPARSE@) diff --git a/src/config/HYPRE_config.h.cmake.in b/src/config/HYPRE_config.h.cmake.in index 86006a16bc..eb22ae7336 100644 --- a/src/config/HYPRE_config.h.cmake.in +++ b/src/config/HYPRE_config.h.cmake.in @@ -67,6 +67,9 @@ /* Use if executing on device with CUDA */ #cmakedefine HYPRE_USING_CUDA 1 +/* Use if executing on device with SYCL */ +#cmakedefine HYPRE_USING_SYCL 1 + /* Use cuBLAS */ #cmakedefine HYPRE_USING_CUBLAS 1 diff --git a/src/config/cmake/HYPRE_CMakeUtilities.cmake b/src/config/cmake/HYPRE_CMakeUtilities.cmake index 0a1e8c8be2..97e11c3c1a 100644 --- a/src/config/cmake/HYPRE_CMakeUtilities.cmake +++ b/src/config/cmake/HYPRE_CMakeUtilities.cmake @@ -25,6 +25,12 @@ function(add_hypre_executables EXE_SRCS) set_source_files_properties(${SRC_FILENAME} PROPERTIES LANGUAGE CUDA) endif (HYPRE_USING_CUDA) + if (HYPRE_USING_SYCL) + # If SYCL is enabled, tag source files to be compiled with dpcpp. 
+ set_source_files_properties(${SRC_FILENAME} PROPERTIES LANGUAGE CXX) + endif (HYPRE_USING_SYCL) + + string(REPLACE ".c" "" EXE_NAME ${SRC_FILENAME}) # Actually add the exe add_executable(${EXE_NAME} ${SRC_FILE}) diff --git a/src/parcsr_ls/CMakeLists.txt b/src/parcsr_ls/CMakeLists.txt index 045dea2545..8ce5945fad 100644 --- a/src/parcsr_ls/CMakeLists.txt +++ b/src/parcsr_ls/CMakeLists.txt @@ -143,8 +143,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS ams.c ads.c ame.c @@ -167,8 +167,8 @@ if (HYPRE_USING_CUDA) par_2s_interp_device.c par_relax_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/parcsr_mv/CMakeLists.txt b/src/parcsr_mv/CMakeLists.txt index 6c40d366d1..ad1eca2fc4 100644 --- a/src/parcsr_mv/CMakeLists.txt +++ b/src/parcsr_mv/CMakeLists.txt @@ -43,16 +43,16 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS par_csr_matvec.c par_csr_fffc_device.c par_csr_matop_device.c par_csr_triplemat_device.c par_vector_device.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/seq_mv/CMakeLists.txt b/src/seq_mv/CMakeLists.txt index 80942d36bf..af06738e3f 100644 --- a/src/seq_mv/CMakeLists.txt +++ b/src/seq_mv/CMakeLists.txt @@ -43,8 +43,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS 
csr_matop_device.c csr_matrix_cuda_utils.c csr_matvec_device.c @@ -62,8 +62,8 @@ if (HYPRE_USING_CUDA) csr_sptrans_device.c vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/sstruct_ls/CMakeLists.txt b/src/sstruct_ls/CMakeLists.txt index 344360ce2b..d11a0908dd 100644 --- a/src/sstruct_ls/CMakeLists.txt +++ b/src/sstruct_ls/CMakeLists.txt @@ -79,8 +79,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS fac_amr_fcoarsen.c fac_amr_rap.c fac_restrict2.c @@ -88,8 +88,8 @@ if (HYPRE_USING_CUDA) fac_zero_stencilcoef.c node_relax.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/sstruct_mv/CMakeLists.txt b/src/sstruct_mv/CMakeLists.txt index 8aeda925f9..013ffb6262 100644 --- a/src/sstruct_mv/CMakeLists.txt +++ b/src/sstruct_mv/CMakeLists.txt @@ -36,13 +36,13 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS sstruct_matrix.c sstruct_vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/struct_ls/CMakeLists.txt b/src/struct_ls/CMakeLists.txt index 11d51c9eed..4c08db3a63 100644 --- a/src/struct_ls/CMakeLists.txt +++ b/src/struct_ls/CMakeLists.txt 
@@ -79,8 +79,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS cyclic_reduction.c HYPRE_struct_int.c HYPRE_struct_pcg.c @@ -108,8 +108,8 @@ if (HYPRE_USING_CUDA) sparse_msg_interp.c sparse_msg_restrict.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/struct_mv/CMakeLists.txt b/src/struct_mv/CMakeLists.txt index b77c886313..8cc286f522 100644 --- a/src/struct_mv/CMakeLists.txt +++ b/src/struct_mv/CMakeLists.txt @@ -44,8 +44,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS struct_axpy.c struct_communication.c struct_copy.c @@ -55,8 +55,8 @@ if (HYPRE_USING_CUDA) struct_scale.c struct_vector.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () convert_filenames_to_full_paths(HDRS) diff --git a/src/utilities/CMakeLists.txt b/src/utilities/CMakeLists.txt index ef0f2923d4..3fbd1eefb6 100644 --- a/src/utilities/CMakeLists.txt +++ b/src/utilities/CMakeLists.txt @@ -51,8 +51,8 @@ target_sources(${PROJECT_NAME} ${HDRS} ) -if (HYPRE_USING_CUDA) - set(CUDA_SRCS +if (HYPRE_USING_CUDA OR HYPRE_USING_SYCL) + set(GPU_SRCS HYPRE_handle.c device_utils.c handle.c @@ -62,8 +62,8 @@ if (HYPRE_USING_CUDA) omp_device.c nvtx.c ) - convert_filenames_to_full_paths(CUDA_SRCS) - set(HYPRE_CUDA_SOURCES ${HYPRE_CUDA_SOURCES} ${CUDA_SRCS} PARENT_SCOPE) + convert_filenames_to_full_paths(GPU_SRCS) + set(HYPRE_GPU_SOURCES ${HYPRE_GPU_SOURCES} ${GPU_SRCS} PARENT_SCOPE) endif () 
convert_filenames_to_full_paths(HDRS) From ec8c5de3f50432fc9f8253059e14eae9a3519021 Mon Sep 17 00:00:00 2001 From: Wayne Mitchell Date: Thu, 4 Nov 2021 22:54:56 +0000 Subject: [PATCH 31/44] Some code cleanup --- src/CMakeLists.txt | 2 - src/seq_mv/csr_matvec_device.c | 5 +- src/struct_mv/_hypre_struct_mv.hpp | 4 - src/struct_mv/boxloop_sycl.h | 4 - src/utilities/_hypre_utilities.hpp | 132 +---------------------------- src/utilities/device_utils.c | 1 - src/utilities/device_utils.h | 132 +---------------------------- 7 files changed, 5 insertions(+), 275 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 94a92e72b1..aa545dfd1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -297,8 +297,6 @@ if (HYPRE_WITH_SYCL) # Set CXX compiler to dpcpp set(CMAKE_CXX_COMPILER "dpcpp") - # WM: try with/without the line below - # set(CMAKE_LINKER "dpcpp") # Add any extra CXX compiler flags HYPRE_WITH_EXTRA_CXXFLAGS if (NOT HYPRE_WITH_EXTRA_CXXFLAGS STREQUAL "") diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index a6a09363e6..d36981c768 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -52,9 +52,10 @@ hypre_CSRMatrixMatvecDevice2( HYPRE_Int trans, hypre_CSRMatrixMatvecRocsparse(trans, alpha, A, x, beta, y, offset); #elif defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixMatvecOnemklsparse(trans, alpha, A, x, beta, y, offset); +// WM: TODO: remove trivial HYPRE_USING_SYCL branch after onemlksparse implementation is in +#elif defined(HYPRE_USING_SYCL) #else // #ifdef HYPRE_USING_CUSPARSE -// WM: TODO: commenting this out for now, but put it back after sycl impelentation is done -/* #error HYPRE SPMV TODO */ +#error HYPRE SPMV TODO #endif return hypre_error_flag; diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index d76cab3557..5224d03cab 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -1481,8 +1481,6 @@ else \ 
/* Reduction BoxLoop1 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ @@ -1504,8 +1502,6 @@ else \ } /* Reduction BoxLoop2 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, sum_var) \ { \ diff --git a/src/struct_mv/boxloop_sycl.h b/src/struct_mv/boxloop_sycl.h index b8a61a07ea..db076f049b 100644 --- a/src/struct_mv/boxloop_sycl.h +++ b/src/struct_mv/boxloop_sycl.h @@ -329,8 +329,6 @@ else \ /* Reduction BoxLoop1 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? */ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop1ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, sum_var) \ { \ hypre_newBoxLoopInit(ndim, loop_size); \ @@ -352,8 +350,6 @@ else \ } /* Reduction BoxLoop2 */ -/* WM: todo - is there a better way to handle the passing of sum_var (the variable where we want the reduction to end up)? 
*/ -/* Right now, it is hardcoded as a HYPRE_Real */ #define hypre_newBoxLoop2ReductionBegin(ndim, loop_size, dbox1, start1, stride1, i1, \ dbox2, start2, stride2, i2, sum_var) \ { \ diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 7914100efe..1ba80c8732 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,23 +110,8 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) -/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* WM: problems with this being inside extern C++ {} */ /* #include */ -/* WM: todo - include below as necessary */ -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include */ -/* #include */ #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -285,7 +270,6 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -410,119 +394,7 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) -/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? 
*/ -/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) -/* return the number of work-items in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) -{ - return item.get_group().get_local_linear_range(); -} - -/* return the flattened or linearlized work-item id in current work-group (not global)*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) -{ - return item.get_local_linear_id(); -} - -/* return the number of sub-groups in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_range().get(0); -} - -/* return the sub_group id in work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_linear_id(); -} - -/* return the work-item lane id in a sub_group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) -{ - return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); -} - -/* return the num of work_groups in nd_range */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) -{ - // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 - - switch (dim) - { - case 1: - return (item.get_group_range(0)); - case 2: - return (item.get_group_range(0) * item.get_group_range(1)); - case 3: - return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); - } - - return -1; -} - -/* return the flattened or linearlized work-group id in nd_range */ -template -static __inline__ 
__attribute__((always_inline)) -hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) -{ - return item.get_group_linear_id(); -} - -/* return the number of work-items in global iteration space*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) -{ - switch (dim) - { - case 1: - return (item.get_global_range(0)); - case 2: - return (item.get_global_range(0) * item.get_global_range(1)); - case 3: - return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); - } - - return -1; -} - -/* return the flattened work-item id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) -{ - return item.get_global_linear_id(); -} - -/* return the number of sub-groups in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) -{ - return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); -} - -/* return the flattened sub-group id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) -{ - return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + - hypre_cuda_get_warp_id(item); -} /* device_utils.c */ sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); @@ -572,8 +444,6 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } -#elif defined(HYPRE_USING_SYCL) -/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 72a30c73be..5747fca82e 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -1228,7 +1228,6 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 1dc3e0f0ff..2350c8c0e6 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,23 +53,8 @@ #elif defined(HYPRE_USING_SYCL) -/* WM: todo - if the include for CL/sycl.hpp is inside extern "C++" {}, I get problems with sycl reductions... totally strange, but true */ +/* WM: problems with this being inside extern C++ {} */ /* #include */ -/* WM: todo - include below as necessary */ -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include // dpct::remove_if, remove_copy_if, copy_if */ - -/* #include */ -/* #include */ -/* #include */ -/* #include */ - -/* #include */ -/* #include */ #endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_DEVICE_OPENMP) @@ -228,7 +213,6 @@ struct hypre_DeviceData #endif #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - /* WM: question - what is the device_allocator? */ hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) @@ -353,119 +337,7 @@ struct hypre_GpuMatData #endif //#if defined(HYPRE_USING_GPU) -/* WM: todo - is this how I want to integrate the functionality below? Do I really need all this? 
*/ -/* NOTE: It doesn't line up that nicely with the cuda/hip implementation since you need to pass item agrs */ #if defined(HYPRE_USING_SYCL) -/* return the number of work-items in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_threads(sycl::nd_item& item) -{ - return item.get_group().get_local_linear_range(); -} - -/* return the flattened or linearlized work-item id in current work-group (not global)*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_thread_id(sycl::nd_item& item) -{ - return item.get_local_linear_id(); -} - -/* return the number of sub-groups in current work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_warps(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_range().get(0); -} - -/* return the sub_group id in work-group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_warp_id(sycl::nd_item& item) -{ - return item.get_sub_group().get_group_linear_id(); -} - -/* return the work-item lane id in a sub_group */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_lane_id(sycl::nd_item& item) -{ - return hypre_cuda_get_thread_id(item) & (item.get_sub_group().get_local_range().get(0)-1); -} - -/* return the num of work_groups in nd_range */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_num_blocks(sycl::nd_item& item) -{ - // return item.get_group().get_group_linear_range(); // API available in SYCL 2020 - - switch (dim) - { - case 1: - return (item.get_group_range(0)); - case 2: - return (item.get_group_range(0) * item.get_group_range(1)); - case 3: - return (item.get_group_range(0) * item.get_group_range(1) * item.get_group_range(2)); - } - - return -1; -} - -/* return the flattened or linearlized work-group id in nd_range */ -template -static __inline__ 
__attribute__((always_inline)) -hypre_int hypre_cuda_get_block_id(sycl::nd_item& item) -{ - return item.get_group_linear_id(); -} - -/* return the number of work-items in global iteration space*/ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_threads(sycl::nd_item& item) -{ - switch (dim) - { - case 1: - return (item.get_global_range(0)); - case 2: - return (item.get_global_range(0) * item.get_global_range(1)); - case 3: - return (item.get_global_range(0) * item.get_global_range(1) * item.get_global_range(2)); - } - - return -1; -} - -/* return the flattened work-item id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_thread_id(sycl::nd_item& item) -{ - return item.get_global_linear_id(); -} - -/* return the number of sub-groups in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_num_warps(sycl::nd_item& item) -{ - return hypre_cuda_get_num_blocks(item) * hypre_cuda_get_num_warps(item); -} - -/* return the flattened sub-group id in global iteration space */ -template -static __inline__ __attribute__((always_inline)) -hypre_int hypre_cuda_get_grid_warp_id(sycl::nd_item& item) -{ - return hypre_cuda_get_block_id(item) * hypre_cuda_get_num_warps(item) + - hypre_cuda_get_warp_id(item); -} /* device_utils.c */ sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); @@ -515,8 +387,6 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) #define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } -#elif defined(HYPRE_USING_SYCL) -/* WM: todo? 
used below in HYPRE_CUDA_LAUNCH2 */ #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC From 3254e3187c5cf9ece1c430cdca925716a55efa25 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty <59661409+abagusetty@users.noreply.github.com> Date: Thu, 4 Nov 2021 19:49:09 -0500 Subject: [PATCH 32/44] [SYCL] convert sycl::device to sycl::device* for better handling (#504) * [SYCL] convert sycl::device to sycl::device* for better handling and setting * [SYCL] fix ONEAPI warning and build issues * [SYCL] fix the setDevice method and address comments --- src/utilities/_hypre_utilities.hpp | 2 +- src/utilities/device_utils.c | 15 ++++--- src/utilities/device_utils.h | 2 +- src/utilities/general.c | 64 ++++++++++++++++++++---------- 4 files changed, 54 insertions(+), 29 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 1ba80c8732..b8addbad0b 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -273,7 +273,7 @@ struct hypre_DeviceData hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) - sycl::device device; + sycl::device* device; HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 5747fca82e..b1bb63252b 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -964,14 +964,14 @@ hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i) catch (sycl::exception const& ex) { std::cout << "Caught asynchronous SYCL exception:" << std::endl - << ex.what() << ", OpenCL code: " << ex.get_cl_code() << std::endl; + << ex.what() << ", SYCL code: " << ex.code() << std::endl; } } }; - sycl::device syclDev = data->device; - sycl::context syclctxt = sycl::context(syclDev, sycl_asynchandler); - stream = new sycl::queue(syclctxt, syclDev, sycl::property_list{sycl::property::queue::in_order{}}); + sycl::device* syclDev = data->device; + sycl::context syclctxt 
= sycl::context(*syclDev, sycl_asynchandler); + stream = new sycl::queue(syclctxt, *syclDev, sycl::property_list{sycl::property::queue::in_order{}}); data->streams[i] = stream; } #endif @@ -1019,7 +1019,7 @@ sycl::queue* hypre_DeviceDataComputeStream(hypre_DeviceData *data) { return hypre_DeviceDataStream(data, - hypre_DeviceDataComputeStreamNum(data)); + hypre_DeviceDataComputeStreamNum(data)); } #if defined(HYPRE_USING_CURAND) @@ -1228,7 +1228,9 @@ hypre_DeviceDataCreate() hypre_DeviceData *data = hypre_CTAlloc(hypre_DeviceData, 1, HYPRE_MEMORY_HOST); #if defined(HYPRE_USING_SYCL) - hypre_DeviceDataDevice(data) = sycl::device(sycl::default_selector{}); + /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ + hypre_DeviceDataDevice(data) = nullptr; + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); #else hypre_DeviceDataDevice(data) = 0; @@ -1486,6 +1488,7 @@ hypre_bind_device( HYPRE_Int myid, /* get number of devices on this node */ hypre_GetDeviceCount(&nDevices); + /* TODO: ABB might need to look into this since nDevices are overwritten by 1 */ nDevices = 1; /* set device */ diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 2350c8c0e6..e4e137ca14 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -216,7 +216,7 @@ struct hypre_DeviceData hypre_device_allocator device_allocator; #endif #if defined(HYPRE_USING_SYCL) - sycl::device device; + sycl::device* device; HYPRE_Int device_max_work_group_size; #else HYPRE_Int device; diff --git a/src/utilities/general.c b/src/utilities/general.c index 8ec1e818e1..0aed7d5252 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -100,8 +100,35 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) #endif #if defined(HYPRE_USING_SYCL) - /* sycl device set at construction of hypre_DeviceData object */ 
-#elif defined(HYPRE_USING_GPU) + HYPRE_Int nDevices=0; + hypre_GetDeviceCount(&nDevices); + if (device_id > nDevices) { + hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... \n"); + } + + HYPRE_Int local_nDevices=0; + for (int i = 0; i < gpu_devices.size(); i++) { + // multi-tile GPUs + if (gpu_devices[i].get_info() > 0) { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices(sycl::info::partition_affinity_domain::numa); + for (auto &tile : subDevicesDomainNuma) { + if (local_nDevices == device_id) { + hypre_HandleDevice(hypre_handle_) = &tile; + } + local_nDevices++; + } + } + // single-tile GPUs + else { + if (local_nDevices == device_id) { + hypre_HandleDevice(hypre_handle_) = &(gpu_devices[i]); + } + local_nDevices++; + } + } +#endif + +#if defined(HYPRE_USING_GPU) && !defined(HYPRE_USING_SYCL) if (hypre_handle_) { hypre_HandleDevice(hypre_handle_) = device_id; @@ -152,25 +179,20 @@ hypre_GetDeviceCount(hypre_int *device_count) #endif #if defined(HYPRE_USING_SYCL) - /* WM: todo - doesn't work on frank... 
commenting out */ - /* sycl::platform platform(sycl::gpu_selector{}); */ - /* auto const& gpu_devices = platform.get_devices(); */ - /* for (int i = 0; i < gpu_devices.size(); i++) */ - /* { */ - /* if (gpu_devices[i].is_gpu()) */ - /* { */ - /* if(gpu_devices[i].get_info() > 0) */ - /* { */ - /* auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices( */ - /* sycl::info::partition_affinity_domain::numa); */ - /* (*device_count) += subDevicesDomainNuma.size(); */ - /* } */ - /* else */ - /* { */ - /* (*device_count)++; */ - /* } */ - /* } */ - /* } */ + sycl::platform platform(sycl::gpu_selector{}); + auto const& gpu_devices = platform.get_devices(sycl::info::device_type::gpu); + for (int i = 0; i < gpu_devices.size(); i++) + { + if(gpu_devices[i].get_info() > 0) + { + auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices(sycl::info::partition_affinity_domain::numa); + (*device_count) += subDevicesDomainNuma.size(); + } + else + { + (*device_count)++; + } + } #endif return hypre_error_flag; From 68fc8be8dde7d244044ca5586dc6182252f744ca Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 7 Dec 2021 23:32:32 +0000 Subject: [PATCH 33/44] [SYCL] add complex types for device --- src/utilities/HYPRE_utilities.h | 10 ++++++++- src/utilities/complex.c | 36 ++++++++++++++++++++++++++------- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 106e70ae5f..57b3dc11ec 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -83,7 +83,15 @@ typedef double HYPRE_Real; #endif #if defined(HYPRE_COMPLEX) -typedef double _Complex HYPRE_Complex; + +#if defined(HYPRE_USING_SYCL) + typedef std::complex HYPRE_Complex; +#elif defined(HYPRE_USING_GPU) + typedef thrust::complex HYPRE_Complex; +#else + typedef double _Complex HYPRE_Complex; +#endif + #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX /* or MPI_LONG_DOUBLE ? 
*/ #else /* default */ diff --git a/src/utilities/complex.c b/src/utilities/complex.c index eb8dca4f38..ba04d01577 100644 --- a/src/utilities/complex.c +++ b/src/utilities/complex.c @@ -9,30 +9,52 @@ #ifdef HYPRE_COMPLEX -#include - HYPRE_Complex hypre_conj( HYPRE_Complex value ) { - return conj(value); +#ifdef HYPRE_USING_SYCL + return std::conj(value); +#elif defined(HYPRE_USING_GPU) + return thrust::conj(value); +#else + return conj(value); +#endif } HYPRE_Real hypre_cabs( HYPRE_Complex value ) { - return cabs(value); +#ifdef HYPRE_USING_SYCL + return std::abs(value); +#elif defined(HYPRE_USING_GPU) + return thrust::abs(value); +#else + return cabs(value); +#endif } HYPRE_Real hypre_creal( HYPRE_Complex value ) { - return creal(value); +#ifdef HYPRE_USING_SYCL + return std::real(value); +#elif defined(HYPRE_USING_GPU) + return thrust::real(value); +#else + return creal(value); +#endif } HYPRE_Real hypre_cimag( HYPRE_Complex value ) { - return cimag(value); +#ifdef HYPRE_USING_SYCL + return std::imag(value); +#elif defined(HYPRE_USING_GPU) + return thrust::imag(value); +#else + return cimag(value); +#endif } -#endif +#endif // HYPRE_COMPLEX From b7ebf4eddf5643440616a8090bd4d9f98748c079 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 8 Dec 2021 05:24:16 +0000 Subject: [PATCH 34/44] [SYCL] kernel launch macro --- src/IJ_mv/IJMatrix_parcsr_device.c | 4 +- src/IJ_mv/IJVector_parcsr_device.c | 2 +- src/parcsr_ls/ads.c | 12 +- src/parcsr_ls/ame.c | 2 +- src/parcsr_ls/ams.c | 40 +- src/parcsr_ls/par_2s_interp_device.c | 10 +- src/parcsr_ls/par_coarsen_device.c | 4 +- src/parcsr_ls/par_gauss_elim.c | 2 +- src/parcsr_ls/par_indepset_device.c | 4 +- src/parcsr_ls/par_interp_device.c | 8 +- src/parcsr_ls/par_interp_trunc_device.c | 2 +- src/parcsr_ls/par_lr_interp_device.c | 14 +- src/parcsr_ls/par_lr_restr_device.c | 2 +- src/parcsr_ls/par_mod_multi_interp_device.c | 18 +- src/parcsr_ls/par_relax_more_device.c | 2 +- src/parcsr_ls/par_strength_device.c | 4 +- 
src/parcsr_mv/par_csr_matop_device.c | 12 +- src/seq_mv/csr_matop_device.c | 2704 ++++++++++++++----- src/seq_mv/csr_matrix.c | 2 +- src/seq_mv/csr_spgemm_device_attempt.c | 6 +- src/seq_mv/csr_spgemm_device_confident.c | 4 +- src/seq_mv/csr_spgemm_device_rowbound.c | 6 +- src/seq_mv/csr_spgemm_device_rowest.c | 12 +- src/seq_mv/csr_spgemm_device_util.c | 4 +- src/seq_mv/csr_spmv_device.c | 10 +- src/struct_mv/_hypre_struct_mv.hpp | 4 +- src/struct_mv/boxloop_cuda.h | 4 +- src/utilities/_hypre_utilities.hpp | 457 +++- src/utilities/device_reducer.h | 2 +- src/utilities/device_utils.c | 24 +- src/utilities/device_utils.h | 455 +++- src/utilities/general.c | 2 + 32 files changed, 3053 insertions(+), 785 deletions(-) diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c index 1760f3f0db..afedb24f5f 100644 --- a/src/IJ_mv/IJMatrix_parcsr_device.c +++ b/src/IJ_mv/IJMatrix_parcsr_device.c @@ -155,7 +155,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix, /* mark unwanted elements as -1 */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator, (HYPRE_Int *) row_indexes, ncols, indicator ); auto new_end = HYPRE_THRUST_CALL( copy_if, @@ -218,7 +218,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big /* dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 ); */ /* output X: 0: keep, 1: zero-out */ diff --git a/src/IJ_mv/IJVector_parcsr_device.c 
b/src/IJ_mv/IJVector_parcsr_device.c index b9afa8c67b..34cb5e8bd0 100644 --- a/src/IJ_mv/IJVector_parcsr_device.c +++ b/src/IJ_mv/IJVector_parcsr_device.c @@ -233,7 +233,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector) /* set/add to local vector */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i, vec_start, new_sora, hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) ); hypre_TFree(new_i, HYPRE_MEMORY_DEVICE); diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c index e8e87b9047..c6cdf716e1 100644 --- a/src/parcsr_ls/ads.c +++ b/src/parcsr_ls/ads.c @@ -576,12 +576,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, Pi_diag_data ); } @@ -638,12 +638,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, 
"warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, Pi_offd_data ); } @@ -846,7 +846,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data, Pix_diag_data, Piy_diag_data, Piz_diag_data ); } @@ -926,7 +926,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data, Pix_offd_data, Piy_offd_data, Piz_offd_data ); } diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c index eea0c6f9ae..e23f025e82 100644 --- a/src/parcsr_ls/ame.c +++ b/src/parcsr_ls/ame.c @@ -467,7 +467,7 @@ HYPRE_Int hypre_AMESetup(void *esolver) { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim, nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc ); } else diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 01fe07450d..470879835b 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -192,7 +192,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim 
= hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } else @@ -235,7 +235,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim, size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data); } else @@ -436,7 +436,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, + HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); //hypre_SyncCudaComputeStream(hypre_handle()); @@ -763,7 +763,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d) { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim, num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d); } else @@ -1539,12 +1539,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, 
dim, G_diag_J, Pi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, Pi_diag_data ); } @@ -1604,12 +1604,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, Pi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, Pi_offd_data ); } @@ -1838,7 +1838,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, Pix_diag_data, Piy_diag_data, Piz_diag_data ); } @@ -1904,7 +1904,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL, Pix_diag_data, Piy_diag_data, NULL ); } @@ -1962,7 +1962,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, 
dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL, Pix_diag_data, NULL, NULL ); } @@ -2039,7 +2039,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, Pix_offd_data, Piy_offd_data, Piz_offd_data ); } @@ -2121,7 +2121,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL, Pix_offd_data, Piy_offd_data, NULL ); } @@ -2193,7 +2193,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL, Pix_offd_data, NULL, NULL ); } @@ -2385,12 +2385,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + 
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_diag_nnz, dim, G_diag_J, GPi_diag_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data, GPi_diag_data ); } @@ -2451,12 +2451,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim, G_offd_nnz, dim, G_offd_J, GPi_offd_J ); gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim, G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data, GPi_offd_data ); } @@ -2681,7 +2681,7 @@ HYPRE_Int hypre_AMSSetup(void *solver, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim, nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data ); } else @@ -3246,7 +3246,7 @@ HYPRE_Int hypre_AMSSetup(void *solver, { dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim, Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data, Gx_data, Gy_data, Gz_data ); } diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c index 
15a497a04b..7e602d19ca 100644 --- a/src/parcsr_ls/par_2s_interp_device.c +++ b/src/parcsr_ls/par_2s_interp_device.c @@ -93,7 +93,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_local, A_offd_nnz > 0, @@ -144,7 +144,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A, * diagnoally scale As_F2F (from both sides) and replace the diagonal */ gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF, gDim, bDim, AF2F_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)), @@ -312,7 +312,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE); dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, bDim, AFC_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)), @@ -367,7 +367,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim); /* only for rows corresponding to F2 (notice flag == -1) */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_local, A_offd_nnz > 0, @@ -417,7 +417,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A, * diagnoally scale As_F2F (from both sides) and replace the diagonal */ gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_MMPEInterpScaleAFF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF, gDim, bDim, AF2F_nr_local, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)), diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c index 6c30741003..cadf8d13d0 100644 --- a/src/parcsr_ls/par_coarsen_device.c +++ b/src/parcsr_ls/par_coarsen_device.c @@ -324,7 +324,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */ HYPRE_Int *new_end; /* init CF_marker_diag and measure_diag: remove some special nodes */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim, num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag ); /* communicate for measure_offd */ @@ -487,7 +487,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF, gDim, bDim, graph_diag_size, graph_diag, diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c index 2a8c9f6189..d3612b6a69 100644 --- a/src/parcsr_ls/par_gauss_elim.c +++ b/src/parcsr_ls/par_gauss_elim.c @@ -420,7 +420,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real dim3 bDim(BLOCK_SIZE, 1, 1); dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y ); return hypre_error_flag; } diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c index bfebafebc1..6ddea58cd2 100644 --- a/src/parcsr_ls/par_indepset_device.c +++ b/src/parcsr_ls/par_indepset_device.c @@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( 
hypre_ParCSRMatrix *S, bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim, graph_diag_size, graph_diag, measure_diag, measure_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, IS_marker_diag, IS_marker_offd, IS_offd_temp_mark ); @@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix *S, /* adjust IS_marker_diag from the received */ gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim, IS_marker_diag, num_elmts_send, send_map_elmts, int_send_buf, IS_offd_temp_mark ); diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c index 8a2d4dc0cd..bd410cce36 100644 --- a/src/parcsr_ls/par_interp_device.c +++ b/src/parcsr_ls/par_interp_device.c @@ -178,7 +178,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim, n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, CF_marker, CF_marker_offd, num_functions, dof_func_dev, dof_func_offd, P_diag_i, P_offd_i); @@ -209,7 +209,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, if (interp_type == 3) { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, hypre_ParCSRMatrixSocDiagJ(S), @@ -222,7 +222,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix *A, } else { - HYPRE_CUDA_LAUNCH( 
hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, hypre_ParCSRMatrixSocDiagJ(S), @@ -1127,7 +1127,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim, n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, A_offd_a, CF_marker, CF_marker_offd, diag_compress_marker, offd_compress_marker, P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp); diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c index 4524f91f9e..6b6d7a8d3f 100644 --- a/src/parcsr_ls/par_interp_trunc_device.c +++ b/src/parcsr_ls/par_interp_trunc_device.c @@ -159,7 +159,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim, nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a ); /* build new P_diag and P_offd */ diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c index 43ac592e95..2587a9298e 100644 --- a/src/parcsr_ls/par_lr_interp_device.c +++ b/src/parcsr_ls/par_lr_interp_device.c @@ -69,7 +69,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, 
bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -110,7 +110,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix *A, /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc, gDim, bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), @@ -255,7 +255,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -330,7 +330,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix *A, hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w, gDim, bDim, W_nr_of_rows, hypre_ParCSRMatrixFirstRowIndex(AFF), @@ -480,7 +480,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums, gDim, bDim, A_nr_of_rows, A_offd_nnz > 0, @@ -523,7 +523,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE); hypre_GpuProfilingPushRange("Compute D_tmp"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp, gDim, 
bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), @@ -563,7 +563,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix *A, /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */ hypre_GpuProfilingPushRange("Compute interp matrix"); gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe, gDim, bDim, W_nr_of_rows, hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)), diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c index 104ec87451..60c2c4894c 100644 --- a/src/parcsr_ls/par_lr_restr_device.c +++ b/src/parcsr_ls/par_lr_restr_device.c @@ -247,7 +247,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix *A, /* assemble the diagonal part of R from Z */ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim, n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a, R_diag_i, R_diag_j, R_diag_a); num_cols_offd_R = num_cols_offd_Z; diff --git a/src/parcsr_ls/par_mod_multi_interp_device.c b/src/parcsr_ls/par_mod_multi_interp_device.c index 3a62f6b813..25cbf2c9b3 100644 --- a/src/parcsr_ls/par_mod_multi_interp_device.c +++ b/src/parcsr_ls/par_mod_multi_interp_device.c @@ -309,7 +309,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 gDim = hypre_GetDefaultCUDAGridDimension(remaining, "warp", bDim); /* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count, gDim, bDim, remaining, current_pass, @@ -403,7 +403,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = 
hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(n_fine, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim, n_fine, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_j, A_offd_data, CF_marker, @@ -555,7 +555,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim, pass_starts[p + 1], pass_starts[p + 2], pass_order, Pi_diag_i, Pi_diag_j, Pi_diag_data, P_diag_i, P_diag_j, P_diag_data, @@ -618,7 +618,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(npoints, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim, pass_starts[p + 1], pass_starts[p + 2], pass_order, @@ -853,7 +853,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, num_points, color, pass_order, pass_marker, pass_marker_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, P_diag_i, P_offd_i ); @@ -879,7 +879,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( 
hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim, num_points, color, pass_order, @@ -1101,7 +1101,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim, num_points, color, pass_order, pass_marker, pass_marker_offd, S_diag_i, S_diag_j, S_offd_i, S_offd_j, Q_diag_i, Q_offd_i ); @@ -1128,7 +1128,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim, num_points, color, pass_order, @@ -1199,7 +1199,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultCUDABlockDimension(); dim3 gDim = hypre_GetDefaultCUDAGridDimension(num_points, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim, num_points, pass_order, A_diag_i, A_diag_data, Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data, w_row_sum ); diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index 657905f3d9..f0a994b634 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -153,7 +153,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, + HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate, gDim, bDim, A_num_rows, diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c 
index a2ca43fc8e..a63b8bd2b4 100644 --- a/src/parcsr_ls/par_strength_device.c +++ b/src/parcsr_ls/par_strength_device.c @@ -139,7 +139,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, if (abs_soc) { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim, num_variables, max_row_sum, strength_threshold, A_diag_data, A_diag_i, A_diag_j, A_offd_data, A_offd_i, A_offd_j, @@ -149,7 +149,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix *A, } else { - HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim, num_variables, max_row_sum, strength_threshold, A_diag_data, A_diag_i, A_diag_j, A_offd_data, A_offd_i, A_offd_j, diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 251e28d3a6..67aa26bcc7 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -620,7 +620,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A) const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), hypre_CSRMatrixNumCols(A_diag), @@ -735,7 +735,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), hypre_CSRMatrixNumCols(A_diag), @@ -765,7 +765,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A, hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag)); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_ConcatDiagAndOffd, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd, gDim, bDim, hypre_CSRMatrixNumRows(E_diag), hypre_CSRMatrixNumCols(E_diag), @@ -1197,21 +1197,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A, if (type == -1) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<-1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<-1>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); } if (type == 1) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); } if (type == 2) { - HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim, hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag), hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd), hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd); diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index d4201d5a26..e76835cecb 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -109,6 +109,782 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data) #endif /* #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) */ +/* ABB: All the compute kernel implementations are grouped here */ +#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + +__global__ void +hypreGPUKernel_CSRMoveDiagFirst( HYPRE_Int nrows, 
+ HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) +{ + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + break; + } + } +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +__global__ void +hypreGPUKernel_CSRCheckDiagFirst( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } +} + +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + 
} + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +__global__ void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +__global__ void +hypreGPUKernel_CSRRowSum( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) +{ + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row_i >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } + + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + HYPRE_Complex row_sum_i = 0.0; + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) { + 
if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } + + row_sum_i = warp_reduce_sum(row_sum_i); + + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} + +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +__global__ void +hypreGPUKernel_CSRExtractDiag( HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + HYPRE_Int has_diag = 0; + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} + +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern(HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + 
HYPRE_Int diag_option) +{ + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } +} + +#elif defined(HYPRE_USING_SYCL) + +void +hypreGPUKernel_CSRMoveDiagFirst( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) +{ + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + break; + } + } +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +void +hypreGPUKernel_CSRCheckDiagFirst( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int 
*ja, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } +} + +void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + + if (find_diag) + { + HYPRE_Complex d = 
read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} + +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +void +hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) +{ + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row_i >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } + + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + HYPRE_Complex row_sum_i = 0.0; + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } + + row_sum_i = warp_reduce_sum(row_sum_i, item); + + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} + +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +void +hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + + if (row >= nrows) + { + return; + } + + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + HYPRE_Int p 
= 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + HYPRE_Int has_diag = 0; + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + + if ( sycl::any_of_group(SG, find_diag) ) + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} + +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +void +hypreGPUKernel_CSRMatrixIntersectPattern( sycl::nd_item<1>& item, + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); + + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } +} + +#endif // HYPRE_USING_SYCL + + #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_CSRMatrix* @@ -603,51 +1379,264 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } -__global__ void -hypreCUDAKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) + +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim, gDim; + + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return hypre_error_flag; +} + +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns 
the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ +HYPRE_Int +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + 
hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +typedef thrust::tuple Int2; +struct Int2Unequal : public thrust::unary_function +{ + __host__ __device__ + bool operator()(const Int2& t) const + { + return (thrust::get<0>(t) != thrust::get<1>(t)); + } +}; + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + thrust::zip_iterator< thrust::tuple > new_end; + + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + } + else + { + new_data = NULL; + + thrust::zip_iterator< thrust::tuple > new_end; + + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, 
A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + } + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim, gDim; - if (row >= nrows) + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) { - return; + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) + else if (type == 1) { - p = read_only_load(ia + row + lane); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + else if (type == 2) { - hypre_int find_diag = j < q && ja[j] == 
row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); } + + hypre_SyncCudaComputeStream(hypre_handle()); } -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +void +hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, + HYPRE_Complex *d, + HYPRE_Int type) { HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); HYPRE_Complex *A_data = hypre_CSRMatrixData(A); @@ -658,516 +1647,957 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) bDim = hypre_GetDefaultDeviceBlockDimension(); gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); hypre_SyncCudaComputeStream(hypre_handle()); +} - return hypre_error_flag; +/* return C = [A; B] */ +hypre_CSRMatrix* +hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) +{ + hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, 
hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; } -/* check if diagonal entry is the first one at each row - * Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -__global__ void -hypreCUDAKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) +/* A = alp * I */ +hypre_CSRMatrix * +hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) { - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); - if (row < nrows) + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); + + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); + + 
return A; +} + +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +{ + __host__ __device__ + bool operator()(const thrust::tuple& t) const { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + const HYPRE_Complex i = thrust::get<0>(t); + const HYPRE_Real j = thrust::get<1>(t); + + return hypre_cabs(i) > j; } -} +}; +/* drop the entries that are smaller than: + * tol if elmt_tols == null, + * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, + HYPRE_Real tol, + HYPRE_Real *elmt_tols) { - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) { - return 0; + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); + } + else + { + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + cabsfirst_greaterthan_second_pred() ); } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, 
gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + thrust::zip_iterator< thrust::tuple > new_end; - hypre_TFree(result, HYPRE_MEMORY_DEVICE); + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - return ierr; + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; } -__global__ void -hypreCUDAKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - 
HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J + * Otherwise, mark pattern not in A-B as -1 in markA + * Note the special treatment for diagonal entries of A (marked as -2) */ +HYPRE_Int +hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *markA, + HYPRE_Int diag_opt) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - if (row >= nrows) - { - return; - } + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, 
HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; + return hypre_error_flag; +} - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } +#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); +#if defined(HYPRE_USING_SYCL) -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif +hypre_CSRMatrix* +hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, + hypre_CSRMatrix *A, + HYPRE_Complex beta, + hypre_CSRMatrix *B ) +{ + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = 
hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (nrows_A != nrows_B || ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n"); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + return NULL; + } -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, + A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL, + &nnzC, &C_i, &C_j, &C_data); - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncCudaComputeStream(hypre_handle()); - return ierr; + return C; } -__global__ void -hypreCUDAKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypre_CSRMatrix* +hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, + hypre_CSRMatrix *B) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + hypre_CSRMatrix *C; - if (row >= nrows) - { - return; - } + if (ncols_A != nrows_B) + { + hypre_printf("Warning! 
incompatible matrix dimensions!\n"); + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + return NULL; + } - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + hypreDevice_CSRSpGemm(A, B, &C); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + hypre_SyncCudaComputeStream(hypre_handle()); - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } + return C; +} - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } +hypre_CSRMatrix* +hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + hypre_CSRMatrix *C ) +{ + hypre_CSRMatrix *BC = hypre_CSRMatrixMultiplyDevice(B, C); + hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC); - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } + hypre_CSRMatrixDestroy(BC); + + return ABC; } HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) +hypre_CSRMatrixTriLowerUpperSolveDevice(char uplo, + hypre_CSRMatrix *A, + HYPRE_Real *l1_norms, + hypre_Vector *f, + hypre_Vector *u ) { - HYPRE_Int ierr = 0; +#if defined(HYPRE_USING_CUSPARSE) + hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u); +#elif defined(HYPRE_USING_ROCSPARSE) + hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u); +#else + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n"); +#endif + return hypre_error_flag; +} - if (hypre_CSRMatrixNumRows(A) != 
hypre_CSRMatrixNumCols(A)) - { - return ierr; - } +/* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part + * corresponding to B. + * Input col_map_offd_B: + * Output col_map_offd_C: union of col_map_offd_B and offd-indices of Bext_offd + * map_B_to_C: mapping from col_map_offd_B to col_map_offd_C + */ - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); +HYPRE_Int +hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + hypre_CSRMatrix **B_ext_diag_ptr, + hypre_CSRMatrix **B_ext_offd_ptr ) +{ + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); + HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); + + HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); + + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; + + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); + + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + 
HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); + + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); + + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); + + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); + + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); + + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; + + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; + + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +HYPRE_Int 
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) +{ + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; + + in_range pred1(first_col_diag_B, last_col_diag_B); + + /* get diag and offd nnz */ + if (job == 0) { + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; + + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; + + return hypre_error_flag; + } + else { + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; + } + + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + if (B_ext_diag_xata) { + auto first = 
oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ + pred1 ); + + //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); + } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ + pred1 ); + + //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); + } + + HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + HYPRE_ONEDPL_CALL( std::transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + const_iterator, //dpct::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + std::minus() ); + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); + + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); + + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + if (B_ext_offd_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ + std::not1(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } 
+ else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ + std::not1(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } + + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + HYPRE_ONEDPL_CALL( std::sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + + HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + + num_cols_offd_C = new_end - col_map_offd_C; + +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; #else - HYPRE_Int *result = NULL; + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); #endif - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( 
oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); + } + + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); + + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); + + if (map_B_to_C_ptr) { + *map_B_to_C_ptr = map_B_to_C; + } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; + + return hypre_error_flag; +} -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); +/*-------------------------------------------------------------------------- + * hypre_CSRMatrixAddPartial: + * adds matrix rows in the CSR matrix B to the CSR Matrix A, where row_nums[i] + * defines to which row of A the i-th row of B is added, and returns a CSR Matrix C; + * Repeated row indices are allowed in row_nums + * Note: The routine does not check for 0-elements which might be generated + * through cancellation of elements in A and B or already contained + * in A and B. 
To remove those, use hypre_CSRMatrixDeleteZeros + *--------------------------------------------------------------------------*/ + +hypre_CSRMatrix* +hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *row_nums) +{ + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); + + return NULL; + } - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, + &nnzC, &C_i, &C_j, &C_data); - hypre_SyncCudaComputeStream(hypre_handle()); + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - return ierr; -} + hypre_SyncCudaComputeStream(hypre_handle()); -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; + return C; +} HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, + HYPRE_Real *colnnz) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - 
HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_j_sorted; + HYPRE_Int num_reduced_col_indices; + HYPRE_Int *reduced_col_indices; + HYPRE_Int *reduced_col_nnz; + + A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); + + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + + // ABB: Replace values in-place with dpct::make_constant_iterator(1) + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, hypre_MEMORY_UNIFIED); + hypre_DeviceDataComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + std::pair new_end = + HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, + values, + reduced_col_indices, + reduced_col_nnz ); + + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); + + num_reduced_col_indices = new_end.first - reduced_col_indices; + + hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); + + hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); + hypre_TFree(values, 
HYPRE_MEMORY_UNIFIED); + + hypre_SyncCudaComputeStream(hypre_handle()); + + return hypre_error_flag; +} - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); - thrust::zip_iterator< thrust::tuple > new_end; + hypre_SyncCudaComputeStream(hypre_handle()); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); + return hypre_error_flag; +} - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } - thrust::zip_iterator< thrust::tuple > new_end; + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim 
= hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_SyncCudaComputeStream(hypre_handle()); - return hypre_error_flag; + return ierr; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) */ -template -__global__ void -hypreCUDAKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +HYPRE_Int 
+hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row_i >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row_i + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - HYPRE_Complex row_sum_i = 0.0; - - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } + HYPRE_Int ierr = 0; - HYPRE_Complex aii = aa[j]; + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - row_sum_i = warp_reduce_sum(row_sum_i); +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } -} + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = 
hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; +#if HYPRE_DEBUG + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif - if (type == 0) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } + hypre_SyncCudaComputeStream(hypre_handle()); - hypre_SyncCudaComputeStream(hypre_handle()); + return ierr; } -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreCUDAKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row >= nrows) - { - return; - } + HYPRE_Int ierr = 0; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - HYPRE_Int has_diag = 0; + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), 
"warp", bDim); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } - } +#if HYPRE_DEBUG + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif + + hypre_SyncCudaComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; 
+ } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + } + else + { + new_data = NULL; + + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + } + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; + + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + 
HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + + hypre_SyncCudaComputeStream(hypre_handle()); } void @@ -1175,101 +2605,107 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Complex *d, HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + sycl::range<1> bDim, gDim; - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + bDim = hypre_GetDefaultDeviceBlockDimension(); + gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncCudaComputeStream(hypre_handle()); } /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) { - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = 
hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; + hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, 
hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE);
+   HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE);
+
+   hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1,
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+
+   /* shift the trailing numRows(B) row pointers of C by nnz(A); note queue::fill counts elements, not bytes */
+   HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(B), HYPRE_MEMORY_DEVICE);
+   hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixNumRows(B)).wait();
+   HYPRE_ONEDPL_CALL( std::transform,
+                      C_i + hypre_CSRMatrixNumRows(A) + 1,
+                      C_i + hypre_CSRMatrixNumRows(C) + 1,
+                      const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)),
+                      C_i + hypre_CSRMatrixNumRows(A) + 1,
+                      std::plus<HYPRE_Int>() );
+   hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE);
+
+
+   hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B),
+                 HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   hypre_CSRMatrixI(C) = C_i;
+   hypre_CSRMatrixJ(C) = C_j;
+   hypre_CSRMatrixData(C) = C_a;
+   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
+
+   return C;
 }
 
 /* A = alp * I */
 hypre_CSRMatrix *
 hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp)
 {
-   hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n);
+   hypre_CSRMatrix *A = 
hypre_CSRMatrixCreate(n, n, n); - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); - return A; + return A; } /* this predicate compares first and second element in a tuple in absolute value */ /* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +struct cabsfirst_greaterthan_second_pred { - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); + bool operator()(const std::tuple& t) const + { + const HYPRE_Complex i = std::get<0>(t); + const HYPRE_Real j = std::get<1>(t); - return hypre_cabs(i) > j; - } + return hypre_cabs(i) > j; + } }; /* drop the entries that are smaller than: @@ -1280,145 +2716,84 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, HYPRE_Real tol, HYPRE_Real *elmt_tols) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int 
nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { + // abb TODO: issue with working here + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, A_data, A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + std::not_fn(less_than(tol)) ); + } + else + { + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, cabsfirst_greaterthan_second_pred() ); - } + } - if (new_nnz == nnz) - { + if (new_nnz == nnz) + { hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); return hypre_error_flag; - } + } - if (!A_ii) - { + if (!A_ii) + { A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + + if (elmt_tols == NULL) + { + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + new_end = HYPRE_ONEDPL_CALL( 
dpct::copy_if, + first, first + nnz, A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than(tol)) ); + } + else + { + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), cabsfirst_greaterthan_second_pred() ); - } - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} + } -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreCUDAKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + // todo: fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); - if (i >= n) - { - return; - } + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, 
HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } + return hypre_error_flag; } /* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J @@ -1430,41 +2805,42 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, HYPRE_Int *markA, HYPRE_Int diag_opt) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + 
nnzB, HYPRE_MEMORY_DEVICE); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); + auto keys_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj); + auto zipped_begin = oneapi::dpl::make_zip_iterator(keys_begin, idx); + HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); - hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); + sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); + sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); + 
HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ +#endif /* HYPRE_USING_SYCL */ + #if defined(HYPRE_USING_GPU) diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c index f387de02e2..7ea19c3b56 100644 --- a/src/seq_mv/csr_matrix.c +++ b/src/seq_mv/csr_matrix.c @@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows, /* set defaults */ hypre_CSRMatrixOwnsData(matrix) = 1; -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) hypre_CSRMatrixSortedJ(matrix) = NULL; hypre_CSRMatrixSortedData(matrix) = NULL; hypre_CSRMatrixCsrsvData(matrix) = NULL; diff --git a/src/seq_mv/csr_spgemm_device_attempt.c b/src/seq_mv/csr_spgemm_device_attempt.c index d6b23a99d8..8fd268de09 100644 --- a/src/seq_mv/csr_spgemm_device_attempt.c +++ b/src/seq_mv/csr_spgemm_device_attempt.c @@ -501,7 +501,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, d_rc, d_rf ); @@ -537,7 +537,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* 
shmem_size, */ num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j, d_ghash2_a, d_rc, NULL ); @@ -557,7 +557,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, m, d_rf, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, diff --git a/src/seq_mv/csr_spgemm_device_confident.c b/src/seq_mv/csr_spgemm_device_confident.c index 86633323bc..a9b5a494df 100644 --- a/src/seq_mv/csr_spgemm_device_confident.c +++ b/src/seq_mv/csr_spgemm_device_confident.c @@ -461,7 +461,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric), gDim, bDim, /* shmem_size, */ m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc, d_ghash_i, d_ghash_j, d_ghash_a ); @@ -486,7 +486,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, /* copy to the final C */ dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new ); hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE); diff --git a/src/seq_mv/csr_spgemm_device_rowbound.c b/src/seq_mv/csr_spgemm_device_rowbound.c index d3dce3e62c..c2703eb00e 100644 --- a/src/seq_mv/csr_spgemm_device_rowbound.c +++ b/src/seq_mv/csr_spgemm_device_rowbound.c @@ -311,17 +311,17 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m, * ---------------------------------------------------------------------------*/ if (hash_type == 'L') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, 
d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'Q') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'D') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else diff --git a/src/seq_mv/csr_spgemm_device_rowest.c b/src/seq_mv/csr_spgemm_device_rowest.c index f058d1d7ed..d94744bcfc 100644 --- a/src/seq_mv/csr_spgemm_device_rowest.c +++ b/src/seq_mv/csr_spgemm_device_rowest.c @@ -284,11 +284,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i dim3 gDim( (nsamples * N + bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) ); - HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); + HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); /* step-1: layer 3-2 */ gDim.x = (K + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, bDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0); //hypre_TFree(d_V1, HYPRE_MEMORY_DEVICE); @@ -297,7 +297,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i d_V3 = (T*) d_rc; gDim.x = (M + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, bDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor); /* done */ @@ -331,13 +331,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, if (row_est_mtd == 1) { /* naive overestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, 
d_ja, d_ib, d_jb, NULL, d_rc ); } else if (row_est_mtd == 2) { /* naive underestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL ); } else if (row_est_mtd == 3) @@ -354,7 +354,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_low = d_low_upp; HYPRE_Int *d_upp = d_low_upp + m; - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp ); /* Cohen's algorithm, stochastic approach */ diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index 9514be1f1a..a3cf7cd951 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/seq_mv/csr_spmv_device.c b/src/seq_mv/csr_spmv_device.c index ba0a185761..bfe691669e 100644 --- a/src/seq_mv/csr_spmv_device.c +++ b/src/seq_mv/csr_spmv_device.c @@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const 
HYPRE_Int group_size = 32; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 32) @@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 16; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 16) @@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 8; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 8) @@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else @@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, 
alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index 5224d03cab..72edfc6527 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -797,7 +797,7 @@ BoxLoopforall( HYPRE_Int length, const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -858,7 +858,7 @@ ReductionBoxLoopforall( HYPRE_Int length, hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index cd477fe2eb..e78234c6d4 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -73,7 +73,7 @@ BoxLoopforall( HYPRE_Int length, const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -134,7 +134,7 @@ ReductionBoxLoopforall( HYPRE_Int length, hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index b8addbad0b..8c1ff91322 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -449,7 +449,7 @@ using namespace 
thrust::placeholders; #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -460,22 +460,22 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_DeviceDataComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ GPU_LAUNCH_SYNC; \ } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #endif /* return the number of threads in block */ @@ -1040,6 +1040,451 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 +#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 + +// #include +// #include +// #include +// #include + +//#include // dpct::remove_if, remove_copy_if, copy_if + +// #include +// #include +// #include +// #include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +#if defined(HYPRE_DEBUG) +#if defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\
+{ \
+   if ( gridsize[0] == 0 || blocksize[0] == 0 ) \
+   { \
+      hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \
+                   __FILE__, __LINE__, \
+                   gridsize[0], blocksize[0]); \
+      assert(0); exit(1); \
+   } \
+   else \
+   { \
+      hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
+         [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \
+            (kernel_name)(item, __VA_ARGS__); \
+      }); \
+   } \
+}
+
+/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right
+ * The following one works OK for now */
+
+#define HYPRE_ONEDPL_CALL(func_name, ...) \
+  func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__);
+
+// /* return the number of threads in block */
+// template 
+// static __forceinline__
+// hypre_int hypre_gpu_get_num_threads()
+// {
+//    switch (dim)
+//    {
+//       case 1:
+//          return (blockDim.x);
+//       case 2:
+//          return (blockDim.x * blockDim.y);
+//       case 3:
+//          return (blockDim.x * blockDim.y * blockDim.z);
+//    }
+
+//    return -1;
+// }
+
+/* return the number of (sub_groups) warps in (work-group) block */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item)
+{
+   return item.get_sub_group().get_group_range().get(0);
+}
+
+/* return the thread lane id in warp */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item)
+{
+   return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1);
+}
+
+// /* return the number of threads in grid */
+// template 
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_threads()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads();
+// }
+
+/* return the flattened work-item/thread id in global work space */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item)
+{
+   return item.get_global_id(0);
+}
+
+// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template 
+static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item<1>& item) +{ + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value 
: public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range : public std::unary_function +{ + T low, up; + + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const + { + return (x >= low && x <= up); + } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +template +struct less_than : std::unary_function +{ + T val; + less_than(T val_) { val = val_; } + + bool 
operator()(const T &x) const { return (x < val); } +}; + +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + +//////////////////////////////////////////////////////////////////////////////////////// + #if defined(HYPRE_USING_CUSPARSE) cudaDataType hypre_HYPREComplexToCudaDataType(); @@ -1315,7 +1760,7 @@ struct ReduceSum /* 2nd reduction with only *one* block */ hypre_assert(nblocks >= 0 && nblocks <= 1024); const dim3 gDim(1), bDim(1024); - HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); + HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); val += init; } diff --git a/src/utilities/device_reducer.h b/src/utilities/device_reducer.h index 729bbce535..8953dec5d3 100644 --- a/src/utilities/device_reducer.h +++ b/src/utilities/device_reducer.h @@ -264,7 +264,7 @@ struct ReduceSum /* 2nd reduction with only *one* block */ hypre_assert(nblocks >= 0 && nblocks <= 1024); const dim3 gDim(1), bDim(1024); - HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); + HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks ); hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); val += init; } diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index b1bb63252b..d108ba9041 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -83,7 +83,7 @@ void hypre_CudaCompileFlagCheck() //cuda_arch_compile_d = 
hypre_TAlloc(hypre_int, 1, HYPRE_MEMORY_DEVICE); HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); HYPRE_CUDA_CALL( cudaFree(cuda_arch_compile_d) ); @@ -190,7 +190,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; } @@ -329,7 +329,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, } */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A, d_diag_i, d_diag_j, d_diag_a, d_offd_i, d_offd_j, d_offd_a, @@ -533,7 +533,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea /* trivial cases, n = 1, 2 */ dim3 bDim = 1; dim3 gDim = 1; - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); } else { @@ -572,7 +572,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, + HYPRE_GPU_LAUNCH( 
hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); if (!work) @@ -615,7 +615,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); return hypre_error_flag; } @@ -647,7 +647,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); return hypre_error_flag; } @@ -679,7 +679,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); return hypre_error_flag; } @@ -716,7 +716,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); return hypre_error_flag; } @@ -749,7 +749,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( 
hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); return hypre_error_flag; } @@ -773,7 +773,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); return hypre_error_flag; } @@ -1231,7 +1231,7 @@ hypre_DeviceDataCreate() /* WM: does the default selector get a GPU if available? Having trouble with getting the device on frank, so temporarily just passing the default selector */ hypre_DeviceDataDevice(data) = nullptr; - hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data).get_info(); + hypre_DeviceDataDeviceMaxWorkGroupSize(data) = hypre_DeviceDataDevice(data)->get_info(); #else hypre_DeviceDataDevice(data) = 0; #endif diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index e4e137ca14..96d14a1435 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -392,7 +392,7 @@ using namespace thrust::placeholders; #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) 
\ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -403,22 +403,22 @@ using namespace thrust::placeholders; } \ else \ { \ - (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_HandleCudaComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ + (kernel_name) <<< (gridsize), (blocksize), shmem_size, hypre_DeviceDataComputeStream(hypre_handle()) >>> (__VA_ARGS__); \ GPU_LAUNCH_SYNC; \ } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ #if defined(HYPRE_USING_CUDA) #define HYPRE_THRUST_CALL(func_name, ...) \ - thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::cuda::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #elif defined(HYPRE_USING_HIP) #define HYPRE_THRUST_CALL(func_name, ...) 
\ - thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleCudaComputeStream(hypre_handle())), __VA_ARGS__); + thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_DeviceDataComputeStream(hypre_handle())), __VA_ARGS__); #endif /* return the number of threads in block */ @@ -983,6 +983,451 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 +#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 + +// #include +// #include +// #include +// #include + +//#include // dpct::remove_if, remove_copy_if, copy_if + +// #include +// #include +// #include +// #include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +#if defined(HYPRE_DEBUG) +#if defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) 
\ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ + (kernel_name)(item, __VA_ARGS__); \ + }); \ + } \ +} + +/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right + * The following one works OK for now */ + +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + +// /* return the number of threads in block */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_num_threads() +// { +// switch (dim) +// { +// case 1: +// return (blockDim.x); +// case 2: +// return (blockDim.x * blockDim.y); +// case 3: +// return (blockDim.x * blockDim.y * blockDim.z); +// } + +// return -1; +// } + +/* return the number of (sub_groups) warps in (work-group) block */ +template +static __forceinline__ +hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the thread lane id in warp */ +template +static __forceinline__ +hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +{ + return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); +} + +// /* return the number of threads in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_threads() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); +// } + +/* return the flattened work-item/thread id in global work space */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) +{ + return item.get_global_id(0); +} + 
+// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template 
+static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item<1>& item) +{ + sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value 
: public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range : public std::unary_function +{ + T low, up; + + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const + { + return (x >= low && x <= up); + } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +template +struct less_than : std::unary_function +{ + T val; + less_than(T val_) { val = val_; } + + bool 
operator()(const T &x) const { return (x < val); } +}; + +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + +//////////////////////////////////////////////////////////////////////////////////////// + #if defined(HYPRE_USING_CUSPARSE) cudaDataType hypre_HYPREComplexToCudaDataType(); diff --git a/src/utilities/general.c b/src/utilities/general.c index 0aed7d5252..2cfec7ab23 100644 --- a/src/utilities/general.c +++ b/src/utilities/general.c @@ -106,6 +106,8 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_) hypre_printf("ERROR: SYCL device-ID exceed the number of devices on-node... 
\n"); } + sycl::platform platform(sycl::gpu_selector{}); + auto gpu_devices = platform.get_devices(sycl::info::device_type::gpu); HYPRE_Int local_nDevices=0; for (int i = 0; i < gpu_devices.size(); i++) { // multi-tile GPUs From 35fa901d14c77be76efc1b1919bf93a400b64d8c Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Wed, 8 Dec 2021 19:57:24 +0000 Subject: [PATCH 35/44] [SYCL] changes to function, var names from _cuda_ to _device_ for unified --- .../distributed_matrix_parcsr.c | 2 +- src/parcsr_ls/ams.c | 2 +- src/parcsr_ls/par_relax.c | 8 +-- src/parcsr_ls/par_relax_more_device.c | 4 +- src/parcsr_mv/par_csr_communication.c | 2 +- src/parcsr_mv/par_csr_matop.c | 2 +- src/parcsr_mv/par_csr_matop_device.c | 6 +-- src/parcsr_mv/par_csr_matvec.c | 16 +++--- src/parcsr_mv/par_csr_triplemat_device.c | 4 +- src/seq_mv/csr_matop_device.c | 46 ++++++++-------- src/seq_mv/csr_matvec_device.c | 4 +- src/seq_mv/csr_spgemm_device.c | 14 ++--- src/seq_mv/csr_sptrans_device.c | 2 +- src/seq_mv/vector.c | 14 ++--- src/sstruct_mv/sstruct_matrix.c | 2 +- src/sstruct_mv/sstruct_vector.c | 2 +- src/test/ij.c | 10 ++-- src/test/ij_assembly.c | 20 +++---- src/test/ij_mm.c | 6 +-- src/utilities/_hypre_utilities.h | 12 ++--- src/utilities/_hypre_utilities.hpp | 6 +-- src/utilities/device_utils.c | 52 ++++++++++--------- src/utilities/device_utils.h | 6 +-- src/utilities/int_array.c | 2 +- src/utilities/protos.h | 12 ++--- 25 files changed, 130 insertions(+), 126 deletions(-) diff --git a/src/distributed_matrix/distributed_matrix_parcsr.c b/src/distributed_matrix/distributed_matrix_parcsr.c index 0df9ae59e8..e6d986dddb 100644 --- a/src/distributed_matrix/distributed_matrix_parcsr.c +++ b/src/distributed_matrix/distributed_matrix_parcsr.c @@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix, // RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done #if defined(HYPRE_USING_GPU) - 
hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return(ierr); diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c index 25549b54e9..9a90c0a71c 100644 --- a/src/parcsr_ls/ams.c +++ b/src/parcsr_ls/ams.c @@ -459,7 +459,7 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A) HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim, nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c index 608bc4209d..63d6b7df03 100644 --- a/src/parcsr_ls/par_relax.c +++ b/src/parcsr_ls/par_relax.c @@ -1117,8 +1117,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif /*----------------------------------------------------------------- @@ -1144,8 +1144,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A, } #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return hypre_error_flag; diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c index c3cf1ce9fb..3388da1f82 100644 --- a/src/parcsr_ls/par_relax_more_device.c +++ b/src/parcsr_ls/par_relax_more_device.c @@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A, rowsums_upper, scale); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + 
A_num_rows, (HYPRE_Real)0, thrust::minimum()); @@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A, /* matrix to relax /* set residual to random */ hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_THRUST_CALL(transform, r_data, r_data + local_size, r_data, diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c index 35fef28c8d..9786d21d31 100644 --- a/src/parcsr_mv/par_csr_communication.c +++ b/src/parcsr_mv/par_csr_communication.c @@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job, recv_data = recv_data_in; // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI. // Need to check MPI documentation whether this is acutally true - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif num_requests = num_sends + num_recvs; diff --git a/src/parcsr_mv/par_csr_matop.c b/src/parcsr_mv/par_csr_matop.c index 8eeb6dcf4c..97552f4aa1 100644 --- a/src/parcsr_mv/par_csr_matop.c +++ b/src/parcsr_mv/par_csr_matop.c @@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix *A, if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE ) { hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C)); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } #endif diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c index 31bb4afb89..992dea4964 100644 --- a/src/parcsr_mv/par_csr_matop_device.c +++ b/src/parcsr_mv/par_csr_matop_device.c @@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A) hypre_CSRMatrixData(B) = B_a; hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return B; } @@ -1044,7 +1044,7 @@ 
hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix *mat, *values = hypre_ParCSRMatrixRowvalues(mat); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA, HYPRE_Int ierr = 0; #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data); - //hypre_SyncCudaComputeStream(hypre_handle()); + //hypre_SyncDeviceComputeStream(hypre_handle()); #else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ HYPRE_Int i; #if defined(HYPRE_USING_DEVICE_OPENMP) diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c index 30921fe960..d53f74a9d8 100644 --- a/src/parcsr_mv/par_csr_matvec.c +++ b/src/parcsr_mv/par_csr_matvec.c @@ -56,8 +56,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif HYPRE_ANNOTATE_FUNC_BEGIN; @@ -348,8 +348,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha, } #if defined(HYPRE_USING_GPU) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -415,8 +415,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha, #if defined(HYPRE_USING_GPU) HYPRE_Int sync_stream; - hypre_GetSyncCudaCompute(&sync_stream); - hypre_SetSyncCudaCompute(0); + hypre_GetSyncDeviceCompute(&sync_stream); + hypre_SetSyncDeviceCompute(0); #endif HYPRE_ANNOTATE_FUNC_BEGIN; @@ -724,8 +724,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha, } #if defined(HYPRE_USING_GPU) - hypre_SetSyncCudaCompute(sync_stream); - hypre_SyncCudaComputeStream(hypre_handle()); + 
hypre_SetSyncDeviceCompute(sync_stream); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE diff --git a/src/parcsr_mv/par_csr_triplemat_device.c b/src/parcsr_mv/par_csr_triplemat_device.c index 0b8a67fd63..5c77572e04 100644 --- a/src/parcsr_mv/par_csr_triplemat_device.c +++ b/src/parcsr_mv/par_csr_triplemat_device.c @@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix *A, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R, hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C))); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 0410ff7474..ff79ec97f4 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -927,7 +927,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -950,7 +950,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, hypreDevice_CSRSpGemm(A, B, &C); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1100,7 +1100,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, *B_ext_diag_ptr = B_ext_diag; *B_ext_offd_ptr = B_ext_offd; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1331,7 +1331,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + 
hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1373,7 +1373,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1394,7 +1394,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -1421,7 +1421,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1466,7 +1466,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1506,7 +1506,7 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -1629,7 +1629,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, row_sum, scal, set_or_add[0] == 's' ); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } void @@ -1648,7 +1648,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ @@ -1907,7 +1907,7 @@ hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, hypre_CSRMatrixData(C) = C_data; 
hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -1930,7 +1930,7 @@ hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, hypreDevice_CSRSpGemm(A, B, &C); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -2080,7 +2080,7 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, *B_ext_diag_ptr = B_ext_diag; *B_ext_offd_ptr = B_ext_offd; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2308,7 +2308,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrixData(C) = C_data; hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return C; } @@ -2354,7 +2354,7 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); hypre_TFree(values, HYPRE_MEMORY_UNIFIED); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -2375,7 +2375,7 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, nrows, A_i, A_j, A_data); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } @@ -2402,7 +2402,7 @@ hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) hypre_TFree(result, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2447,7 +2447,7 @@ hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2487,7 +2487,7 @@ 
hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, hypre_TFree(result, HYPRE_MEMORY_DEVICE); #endif - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return ierr; } @@ -2598,7 +2598,7 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, row_sum, scal, set_or_add[0] == 's' ); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } void @@ -2617,7 +2617,7 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ @@ -2878,7 +2878,7 @@ hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, *AT_ptr = C; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 811040a510..8b61018ccd 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -117,7 +117,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans, hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -201,7 +201,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, #endif dBuffer) ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); if (trans) { diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index 7d44c2cd05..b4074dadb9 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc); 
#ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("Rownnz time %f\n", t2); #endif @@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, 1 /* exact row nnz */, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, d_rc + 2 * m, thrust::identity() ); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzBound time %f\n", t2); #endif @@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, rownnz_exact, &d_ic, &d_jc, &d_c, &nnzC); 
#ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index 548665ed2e..bd85778a03 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -137,7 +137,7 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR *d_ac_out = csc_a; #ifdef HYPRE_PROFILE - hypre_SyncCudaDevice(hypre_handle()) + hypre_SyncDevice(hypre_handle()) hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime(); #endif diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index 8b024f39c5..bfab868fbb 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -300,7 +300,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -488,7 +488,7 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -542,7 +542,7 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -596,7 +596,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -647,7 +647,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - 
hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -704,7 +704,7 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -806,7 +806,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); diff --git a/src/sstruct_mv/sstruct_matrix.c b/src/sstruct_mv/sstruct_matrix.c index 1d9ce85366..e51066abcc 100644 --- a/src/sstruct_mv/sstruct_matrix.c +++ b/src/sstruct_mv/sstruct_matrix.c @@ -392,7 +392,7 @@ hypre_SStructPMatrixSetBoxValues( hypre_SStructPMatrix *pmatrix, values, action, -1, 0); /* TODO: Why need DeviceSync? */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/sstruct_mv/sstruct_vector.c b/src/sstruct_mv/sstruct_vector.c index fdeeae6421..fa8db02a35 100644 --- a/src/sstruct_mv/sstruct_vector.c +++ b/src/sstruct_mv/sstruct_vector.c @@ -247,7 +247,7 @@ hypre_SStructPVectorSetBoxValues( hypre_SStructPVector *pvector, /* TODO: Why need DeviceSync? 
*/ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/test/ij.c b/src/test/ij.c index 26640554c7..a3dcfc76b3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -3406,7 +3406,7 @@ main( hypre_int argc, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3766,7 +3766,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3804,7 +3804,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3865,7 +3865,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; @@ -3897,7 +3897,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index bb17d32803..fb28c9ba55 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -678,7 +678,7 @@ test_Set(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -707,7 +707,7 @@ test_Set(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -831,7 +831,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, chunk_size = nrows / nchunks; 
#if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif time_index = hypre_InitializeTiming("Test SetValues OffProc"); @@ -862,7 +862,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, //cudaProfilerStop(); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -945,7 +945,7 @@ test_SetSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -996,7 +996,7 @@ test_SetSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1072,7 +1072,7 @@ test_AddSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1120,7 +1120,7 @@ test_AddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1178,7 +1178,7 @@ test_SetAddSet(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1244,7 +1244,7 @@ test_SetAddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif diff --git a/src/test/ij_mm.c b/src/test/ij_mm.c index 4bbf24fc39..807e9b1630 100644 --- a/src/test/ij_mm.c +++ b/src/test/ij_mm.c @@ -161,7 +161,7 @@ void runjob1( HYPRE_ParCSRMatrix parcsr_A, if (i == rep - 
1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, A*A", hypre_MPI_COMM_WORLD); @@ -350,7 +350,7 @@ void runjob2( HYPRE_ParCSRMatrix parcsr_A, if (i == 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, RAP2", hypre_MPI_COMM_WORLD); @@ -452,7 +452,7 @@ main( hypre_int argc, HYPRE_Init(); /* for timing, sync after kernels */ - hypre_SetSyncCudaCompute(1); + hypre_SetSyncDeviceCompute(1); #if defined(HYPRE_USING_CUDA) hypre_HandleDefaultExecPolicy(hypre_handle()) = HYPRE_EXEC_DEVICE; diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 0df44e6bea..d32c9a0c79 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1740,8 +1740,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -1772,10 +1772,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); 
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 38edeac91d..e93a2b55c6 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -443,9 +443,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -1087,7 +1087,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index 946a75c4be..d0594c7166 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -84,7 +84,7 @@ void hypre_CudaCompileFlagCheck() HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, 
sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); @@ -1373,7 +1373,7 @@ hypre_DeviceDataDestroy(hypre_DeviceData *data) } HYPRE_Int -hypre_SyncCudaDevice(hypre_Handle *hypre_handle) +hypre_SyncDevice(hypre_Handle *hypre_handle) { #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); @@ -1381,6 +1381,8 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipDeviceSynchronize() ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() ); #endif return hypre_error_flag; } @@ -1400,55 +1402,57 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle) * action: 0: set sync stream to false * 1: set sync stream to true * 2: restore sync stream to default - * 3: return the current value of cuda_compute_stream_sync - * 4: sync stream based on cuda_compute_stream_sync + * 3: return the current value of device_compute_stream_sync + * 4: sync stream based on device_compute_stream_sync */ HYPRE_Int -hypre_SyncCudaComputeStream_core(HYPRE_Int action, - hypre_Handle *hypre_handle, - HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_SyncDeviceComputeStream_core(HYPRE_Int action, + hypre_Handle *hypre_handle, + HYPRE_Int *device_compute_stream_sync_ptr) { /* with UVM the default is to sync at kernel completions, since host is also able to * touch GPU memory */ #if defined(HYPRE_USING_UNIFIED_MEMORY) - static const HYPRE_Int cuda_compute_stream_sync_default = 1; + static const HYPRE_Int 
device_compute_stream_sync_default = 1; #else - static const HYPRE_Int cuda_compute_stream_sync_default = 0; + static const HYPRE_Int device_compute_stream_sync_default = 0; #endif /* this controls if synchronize the stream after computations */ - static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default; + static HYPRE_Int device_compute_stream_sync = device_compute_stream_sync_default; switch (action) { case 0: - cuda_compute_stream_sync = 0; + device_compute_stream_sync = 0; break; case 1: - cuda_compute_stream_sync = 1; + device_compute_stream_sync = 1; break; case 2: - cuda_compute_stream_sync = cuda_compute_stream_sync_default; + device_compute_stream_sync = device_compute_stream_sync_default; break; case 3: - *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync; + *device_compute_stream_sync_ptr = device_compute_stream_sync; break; case 4: #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #else - if (cuda_compute_stream_sync) + if (device_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->ext_oneapi_submit_barrier() ); #endif } #endif break; default: - hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n"); + hypre_printf("hypre_SyncDeviceComputeStream_core invalid action\n"); hypre_error_in_arg(1); } @@ -1456,35 +1460,35 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action, } HYPRE_Int -hypre_SetSyncCudaCompute(HYPRE_Int action) +hypre_SetSyncDeviceCompute(HYPRE_Int action) { /* convert to 1/0 */ action = action != 0; - hypre_SyncCudaComputeStream_core(action, NULL, NULL); + hypre_SyncDeviceComputeStream_core(action, NULL, NULL); return hypre_error_flag; } HYPRE_Int -hypre_RestoreSyncCudaCompute() 
+hypre_RestoreSyncDeviceCompute() { - hypre_SyncCudaComputeStream_core(2, NULL, NULL); + hypre_SyncDeviceComputeStream_core(2, NULL, NULL); return hypre_error_flag; } HYPRE_Int -hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr) { - hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr); + hypre_SyncDeviceComputeStream_core(3, NULL, device_compute_stream_sync_ptr); return hypre_error_flag; } HYPRE_Int -hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle) +hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle) { - hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL); + hypre_SyncDeviceComputeStream_core(4, hypre_handle, NULL); return hypre_error_flag; } diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 4bddafa330..59549ca6db 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -386,9 +386,9 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC @@ -1030,7 +1030,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #endif 
#else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC diff --git a/src/utilities/int_array.c b/src/utilities/int_array.c index 7a51fbb80d..65ea3f5ef9 100644 --- a/src/utilities/int_array.c +++ b/src/utilities/int_array.c @@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif return ierr; diff --git a/src/utilities/protos.h b/src/utilities/protos.h index eb41f99847..ad3b5ff8a8 100644 --- a/src/utilities/protos.h +++ b/src/utilities/protos.h @@ -269,8 +269,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -301,10 +301,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int 
hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); From 243e2b8f8fd3cb59f9088cc2359c268dd0490efa Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 9 Dec 2021 21:15:57 +0000 Subject: [PATCH 36/44] [SYCL] update, unify new functions for CUDA and SYCL in csr_matop_device --- src/seq_mv/csr_matop_device.c | 758 +++++++++++------------------ src/utilities/HYPRE_utilities.h | 6 +- src/utilities/_hypre_utilities.h | 5 + src/utilities/_hypre_utilities.hpp | 77 +-- src/utilities/device_utils.c | 71 ++- src/utilities/device_utils.h | 45 +- src/utilities/memory.h | 5 + 7 files changed, 421 insertions(+), 546 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index ff79ec97f4..cd6e819515 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -1378,139 +1378,6 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } - -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return hypre_error_flag; -} - -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) -{ - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return 0; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_GPU_LAUNCH( 
hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if 
HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - typedef thrust::tuple Int2; struct Int2Unequal : public thrust::unary_function { @@ -1595,62 +1462,6 @@ hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) return hypre_error_flag; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - if (type == 0) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - -void -hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - 
HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) @@ -1819,49 +1630,6 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, return hypre_error_flag; } -/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J - * Otherwise, mark pattern not in A-B as -1 in markA - * Note the special treatment for diagonal entries of A (marked as -2) */ -HYPRE_Int -hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *markA, - HYPRE_Int diag_opt) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); - - 
hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - #endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_SYCL) @@ -2167,7 +1935,7 @@ hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: qu } HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); HYPRE_ONEDPL_CALL( std::transform, B_ext_diag_bigj, B_ext_diag_bigj + B_ext_diag_nnz, @@ -2333,8 +2101,8 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); // ABB: Replace values in-place with dpct::make_constant_iterator(1) - HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, hypre_MEMORY_UNIFIED); - hypre_DeviceDataComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); + hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); std::pair new_end = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, values, @@ -2359,163 +2127,30 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, return hypre_error_flag; } - HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) 
+hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) { HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); HYPRE_Int *A_i = hypre_CSRMatrixI(A); HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); - - hypre_SyncDeviceComputeStream(hypre_handle()); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; - return hypre_error_flag; -} + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) -{ - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + if (new_nnz == nnz) { - return 0; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - - HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row 
of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) - */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Real tol ) -{ - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; -#endif - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + 
hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - zipped_begin, zipped_begin + nnz, - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; } new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); @@ -2564,62 +2199,6 @@ hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) return hypre_error_flag; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - if (type == 0) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, 
bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - -void -hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - sycl::range<1> bDim, gDim; - - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - - hypre_SyncDeviceComputeStream(hypre_handle()); -} - /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) @@ -2640,7 +2219,7 @@ hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - hypre_DeviceDataComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); HYPRE_ONEDPL_CALL( std::transform, C_i + hypre_CSRMatrixNumRows(A) + 1, C_i + hypre_CSRMatrixNumRows(C) + 1, @@ -2782,6 +2361,158 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, return hypre_error_flag; } +#endif /* HYPRE_USING_SYCL */ + + +#if defined(HYPRE_USING_GPU) + +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of 
each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ +HYPRE_Int +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) +{ + HYPRE_Int ierr = 0; + + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), 
hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +{ + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + +#if defined(HYPRE_USING_CUDA) + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + + hypre_TFree(result, HYPRE_MEMORY_DEVICE); + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return hypre_error_flag; +} + /* markA: array of size nnz(A), for 
pattern of (A and B), markA is the column indices as in A_J * Otherwise, mark pattern not in A-B as -1 in markA * Note the special treatment for diagonal entries of A (marked as -2) */ @@ -2803,17 +2534,26 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#if defined(HYPRE_USING_CUDA) + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); +#elif defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); - auto keys_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj); - auto zipped_begin = oneapi::dpl::make_zip_iterator(keys_begin, idx); + auto zipped_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj, idx); HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); +#endif hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - sycl::range<1> bDim = hypre_GetDefaultDeviceBlockDimension(); - sycl::range<1> gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); @@ -2825,62 +2565,112 @@ hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, return hypre_error_flag; } -#endif /* HYPRE_USING_SYCL */ +void +hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, + 
HYPRE_Complex *d, + HYPRE_Int type) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + + hypre_SyncDeviceComputeStream(hypre_handle()); +} + +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + + hypre_SyncDeviceComputeStream(hypre_handle()); +} -#if defined(HYPRE_USING_GPU) HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, HYPRE_Int data) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *C_data; - 
HYPRE_Int *C_i; - HYPRE_Int *C_j; - hypre_CSRMatrix *C; - - - /* trivial case */ - if (nnz_A == 0) - { - C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); - C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); - C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); - } - else - { + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + hypre_CSRMatrix *C; + + + /* trivial case */ + if (nnz_A == 0) + { + C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); + C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); + C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); + } + else + { #if defined(HYPRE_USING_CUSPARSE) - hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ROCSPARSE) - hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ONEMKLSPARSE) - hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #else - hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); + hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif - } + } - C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) 
= C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - *AT_ptr = C; + *AT_ptr = C; - hypre_SyncDeviceComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } #endif /* #if defined(HYPRE_USING_GPU) */ diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 14fe32b136..f8bbb154f8 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -185,11 +185,15 @@ HYPRE_Int HYPRE_AssumedPartitionCheck(); * HYPRE memory location *--------------------------------------------------------------------------*/ +// ABB: HYPRE_MEMORY_UNIFIED for the case of allocating SHARED memory +// specifically at some locations and everywhere as can be enabled +// with HYPRE_USING_UNIFIED_MEMORY macro typedef enum _HYPRE_MemoryLocation { HYPRE_MEMORY_UNDEFINED = -1, HYPRE_MEMORY_HOST, - HYPRE_MEMORY_DEVICE + HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_UNIFIED } HYPRE_MemoryLocation; HYPRE_Int HYPRE_SetMemoryLocation(HYPRE_MemoryLocation memory_location); diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index d32c9a0c79..d26bf1927b 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -635,6 +635,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index e93a2b55c6..ee4ece0d96 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,6 +110,8 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) 
+typedef sycl::range<1> dim3; + /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -392,17 +394,22 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1025,9 +1032,6 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, @@ -1062,20 +1066,19 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_USING_SYCL) -#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 -#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 +#pragma once -// #include -// #include -// #include -// #include +#include +#include +#include +#include -//#include // dpct::remove_if, remove_copy_if, copy_if +#include // dpct::remove_if, 
remove_copy_if, copy_if, scatter_if -// #include -// #include -// #include -// #include +#include +#include +#include +#include #define __forceinline__ __inline__ __attribute__((always_inline)) @@ -1104,7 +1107,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); } \ else \ { \ - hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ (kernel_name)(item, __VA_ARGS__); \ }); \ @@ -1115,7 +1118,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * The following one works OK for now */ #define HYPRE_ONEDPL_CALL(func_name, ...) \ - func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); // /* return the number of threads in block */ // template @@ -1431,16 +1434,12 @@ T warp_reduce_sum(T in, sycl::nd_item<1>& item) // }; template -struct in_range : public std::unary_function +struct in_range { T low, up; - in_range(T low_, T up_) { low = low_; up = up_; } - bool operator()(const T &x) const - { - return (x >= low && x <= up); - } + bool operator()(const T &x) const { return (x >= low && x <= up); } }; // template @@ -1456,15 +1455,25 @@ struct in_range : public std::unary_function // } // }; -template -struct less_than : std::unary_function +#ifdef HYPRE_COMPLEX +template::value>::type> +struct less_than { - T val; - less_than(T val_) { val = val_; } - - bool operator()(const T &x) const { return (x < val); } + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); } }; - +#else +template::value>::type> +struct less_than +{ + T val; + 
less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (x < val); } +}; +#endif // template // struct modulo : public thrust::unary_function // { diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index d0594c7166..d66c9a500d 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -9,15 +9,15 @@ #include "_hypre_utilities.hpp" #if defined(HYPRE_USING_SYCL) -sycl::range<1> hypre_GetDefaultDeviceBlockDimension() +dim3 hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, - const char *granularity, - sycl::range<1> wgDim) +dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) { HYPRE_Int num_WGs = 0; HYPRE_Int num_workitems_per_WG = wgDim[0]; @@ -42,7 +42,67 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } -#endif + + +// /** +// * Get NNZ of each row in d_row_indices and stored the results in d_rownnz +// * All pointers are device pointers. 
+// * d_rownnz can be the same as d_row_indices +// */ +// void +// hypreCUDAKernel_GetRowNnz(sycl::nd_item<1>& item, +// HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, +// HYPRE_Int *d_offd_ia, +// HYPRE_Int *d_rownnz) +// { +// const HYPRE_Int global_thread_id = hypre_sycl_get_grid_thread_id<1, 1>(item); + +// if (global_thread_id < nrows) +// { +// HYPRE_Int i; + +// if (d_row_indices) +// { +// i = read_only_load(&d_row_indices[global_thread_id]); +// } +// else +// { +// i = global_thread_id; +// } + +// d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + +// read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); +// } +// } + +HYPRE_Int +hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return hypre_error_flag; + } + + HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); + + // TODO: need to fix this by passing a "predicate" as last argument + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + d_row_ind ); + + HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + sycl::maximum() ); + + return hypre_error_flag; +} + +#endif // HYPRE_USING_SYCL #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -1570,4 +1630,3 @@ hypre_bind_device( HYPRE_Int myid, return hypre_error_flag; } - diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 59549ca6db..e8fa14cca2 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -53,6 +53,8 @@ #elif defined(HYPRE_USING_SYCL) +typedef sycl::range<1> dim3; + /* WM: problems with this being inside extern C++ {} */ 
/* #include */ @@ -335,17 +337,22 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -968,9 +975,6 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, @@ -1005,20 +1009,19 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #if defined(HYPRE_USING_SYCL) -#define PSTL_USE_PARALLEL_POLICIES 0 // for libstdc++ 9 -#define _GLIBCXX_USE_TBB_PAR_BACKEND 0 // for libstdc++ 10 +#pragma once -// #include -// #include -// #include -// #include +#include +#include +#include +#include -//#include // dpct::remove_if, remove_copy_if, copy_if +#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if -// #include -// #include -// #include -// #include +#include 
+#include +#include +#include #define __forceinline__ __inline__ __attribute__((always_inline)) @@ -1047,7 +1050,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); } \ else \ { \ - hypre_DeviceDataComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ (kernel_name)(item, __VA_ARGS__); \ }); \ @@ -1058,7 +1061,7 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * The following one works OK for now */ #define HYPRE_ONEDPL_CALL(func_name, ...) \ - func_name(oneapi::dpl::execution::make_device_policy(*hypre_DeviceDataComputeStream(hypre_handle()), __VA_ARGS__); + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); // /* return the number of threads in block */ // template diff --git a/src/utilities/memory.h b/src/utilities/memory.h index bd815020c1..6fcaa29a01 100644 --- a/src/utilities/memory.h +++ b/src/utilities/memory.h @@ -122,6 +122,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } From 9eb1f7f38d268d5400faede532f8f03cfaa0e735 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Fri, 10 Dec 2021 20:23:25 +0000 Subject: [PATCH 37/44] [SYCL] enable oneDPL and some more updates --- src/config/configure.in | 2 +- src/configure | 2 +- src/struct_ls/pfmg_setup.c | 5 - src/struct_mv/struct_innerprod.c | 4 - src/utilities/_hypre_utilities.hpp | 15 +- src/utilities/device_utils.c | 226 ++++++++++++++--------------- src/utilities/device_utils.h | 15 +- 7 files changed, 130 insertions(+), 139 deletions(-) diff --git a/src/config/configure.in b/src/config/configure.in index 06e6a22796..8edcabc68c 
100644 --- a/src/config/configure.in +++ b/src/config/configure.in @@ -2316,7 +2316,7 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"], if test "$hypre_user_chose_cuflags" = "no" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" if test "$hypre_using_debug" = "yes" then CUFLAGS="-O0 -Wall -g ${CUFLAGS}" diff --git a/src/configure b/src/configure index 7993465afb..66d6707f63 100755 --- a/src/configure +++ b/src/configure @@ -9143,7 +9143,7 @@ $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h if test "$hypre_user_chose_cuflags" = "no" then - CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" + CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel" if test "$hypre_using_debug" = "yes" then CUFLAGS="-O0 -Wall -g ${CUFLAGS}" diff --git a/src/struct_ls/pfmg_setup.c b/src/struct_ls/pfmg_setup.c index ad8afa5e1b..c30ba6d8e0 100644 --- a/src/struct_ls/pfmg_setup.c +++ b/src/struct_ls/pfmg_setup.c @@ -1695,7 +1695,6 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real tcy = -diag * (a_cs[Ai] + a_cn[Ai] + a_an[Ai] + a_as[Ai] + a_bn[Ai] + a_bs[Ai] + a_csw[Ai] + a_cse[Ai] + a_cnw[Ai] + a_cne[Ai]); cyb += tcy; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -1707,7 +1706,6 @@ hypre_PFMGComputeDxyz_SS19( HYPRE_Int bi, HYPRE_Real tcz = -diag * (a_ac[Ai] + a_bc[Ai] + a_aw[Ai] + a_ae[Ai] + a_an[Ai] + a_as[Ai] + a_bw[Ai] + a_be[Ai] + a_bn[Ai] + a_bs[Ai]); czb += tcz; -#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) @@ -1995,7 +1993,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcx -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); cxb += tcx; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cxb) @@ -2010,7 +2007,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcy -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + 
a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); cyb += tcy; -#endif } hypre_BoxLoop1ReductionEnd(Ai, cyb) @@ -2025,7 +2021,6 @@ hypre_PFMGComputeDxyz_SS27( HYPRE_Int bi, tcz -= diag * (a_asw[Ai] + a_ase[Ai] + a_anw[Ai] + a_ane[Ai] + a_bsw[Ai] + a_bse[Ai] + a_bnw[Ai] + a_bne[Ai]); czb += tcz; -#endif } hypre_BoxLoop1ReductionEnd(Ai, czb) diff --git a/src/struct_mv/struct_innerprod.c b/src/struct_mv/struct_innerprod.c index a32c06e0e4..81d8d27f70 100644 --- a/src/struct_mv/struct_innerprod.c +++ b/src/struct_mv/struct_innerprod.c @@ -89,11 +89,7 @@ hypre_StructInnerProd( hypre_StructVector *x, box_sum) { HYPRE_Real tmp = xp[xi] * hypre_conj(yp[yi]); -#if defined(HYPRE_USING_SYCL) - hypre_sycl_sum += tmp; -#else box_sum += tmp; -#endif } hypre_BoxLoop2ReductionEnd(xi, yi, box_sum); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index ee4ece0d96..277d4b9176 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -111,6 +111,7 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) typedef sycl::range<1> dim3; +#define __global__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -404,6 +405,13 @@ dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -1030,13 +1038,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* 
hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index d66c9a500d..85721a8145 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -8,6 +8,109 @@ #include "_hypre_utilities.h" #include "_hypre_utilities.hpp" +// some common kernels for CUDA, HIP and SYCL +#ifdef HYPRE_USING_GPU + +/** + * Get NNZ of each row in d_row_indices and stored the results in d_rownnz + * All pointers are device pointers. + * d_rownnz can be the same as d_row_indices + */ +__global__ void +hypreGPUKernel_GetRowNnz( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1> item, + #endif + HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, + HYPRE_Int *d_offd_ia, + HYPRE_Int *d_rownnz) +{ + +#ifdef HYPRE_USING_CUDA + const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); +#elif defined(HYPRE_USING_SYCL) + const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item); +#endif + + if (global_thread_id < nrows) + { + HYPRE_Int i; + + if (d_row_indices) + { + i = read_only_load(&d_row_indices[global_thread_id]); + } + else + { + i = global_thread_id; + } + + d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + + read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); + } +} + +HYPRE_Int* +hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) +{ + HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_CUDA + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + 
nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows + 1), + d_row_ptr); +#endif + + return d_row_ptr; +} + +HYPRE_Int +hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr) +{ +#ifdef HYPRE_USING_CUDA + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + d_row_ind, d_row_ind + nnz, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows + 1), + d_row_ptr); +#endif + + return hypre_error_flag; +} + +HYPRE_Int* +hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return NULL; + } + + HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); + + return d_row_ind; +} + +#endif // HYPRE_USING_GPU + #if defined(HYPRE_USING_SYCL) dim3 hypre_GetDefaultDeviceBlockDimension() { @@ -43,38 +146,6 @@ dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } - -// /** -// * Get NNZ of each row in d_row_indices and stored the results in d_rownnz -// * All pointers are device pointers. 
-// * d_rownnz can be the same as d_row_indices -// */ -// void -// hypreCUDAKernel_GetRowNnz(sycl::nd_item<1>& item, -// HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, -// HYPRE_Int *d_offd_ia, -// HYPRE_Int *d_rownnz) -// { -// const HYPRE_Int global_thread_id = hypre_sycl_get_grid_thread_id<1, 1>(item); - -// if (global_thread_id < nrows) -// { -// HYPRE_Int i; - -// if (d_row_indices) -// { -// i = read_only_load(&d_row_indices[global_thread_id]); -// } -// else -// { -// i = global_thread_id; -// } - -// d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + -// read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); -// } -// } - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -87,14 +158,14 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); - // TODO: need to fix this by passing a "predicate" as last argument - HYPRE_ONEDPL_CALL( dpct::scatter_if, - oneapi::dpl::counting_iterator(0), - oneapi::dpl::counting_iterator(nrows), - d_row_ptr, - oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), - d_row_ind ); + /* // TODO: need to fix this by passing a "predicate" as last argument */ + /* HYPRE_ONEDPL_CALL( dpct::scatter_if, */ + /* oneapi::dpl::counting_iterator(0), */ + /* oneapi::dpl::counting_iterator(nrows), */ + /* d_row_ptr, */ + /* oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), */ + /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ + /* d_row_ind ); */ HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); @@ -210,36 +281,6 @@ hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, return gDim; } -/** - * Get NNZ 
of each row in d_row_indices and stored the results in d_rownnz - * All pointers are device pointers. - * d_rownnz can be the same as d_row_indices - */ -__global__ void -hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, - HYPRE_Int *d_offd_ia, - HYPRE_Int *d_rownnz) -{ - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (global_thread_id < nrows) - { - HYPRE_Int i; - - if (d_row_indices) - { - i = read_only_load(&d_row_indices[global_thread_id]); - } - else - { - i = global_thread_id; - } - - d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + - read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); - } -} - /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, @@ -255,7 +296,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_GPU_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, + HYPRE_GPU_LAUNCH( hypreGPUKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; @@ -465,22 +506,6 @@ struct hypre_empty_row_functor } }; -HYPRE_Int* -hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) -{ - /* trivial case */ - if (nrows <= 0 || nnz <= 0) - { - return NULL; - } - - HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); - - return d_row_ind; -} - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -539,33 +564,6 @@ template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HY HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); 
#endif -HYPRE_Int* -hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) -{ - HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return d_row_ptr; -} - -HYPRE_Int -hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr) -{ - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return hypre_error_flag; -} - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index e8fa14cca2..54d9c8a620 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -54,6 +54,7 @@ #elif defined(HYPRE_USING_SYCL) typedef sycl::range<1> dim3; +#define __global__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -347,6 +348,13 @@ dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind); +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -973,13 +981,6 @@ HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int 
nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); From f11b593119a7104899ca5c7fa989c5bb4752ea70 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 09:58:53 +0000 Subject: [PATCH 38/44] [SYCL] adding sycl::gather and few more common GPU functions --- src/utilities/_hypre_utilities.hpp | 42 ++++++-- src/utilities/device_utils.c | 157 ++++++++++++++++------------- src/utilities/device_utils.h | 42 ++++++-- 3 files changed, 149 insertions(+), 92 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 277d4b9176..24530a4d38 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -412,6 +412,16 @@ HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -1017,10 +1027,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, 
HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -1032,12 +1038,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1089,6 +1089,28 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) #define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } diff --git a/src/utilities/device_utils.c 
b/src/utilities/device_utils.c index 85721a8145..bf961145fd 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -25,11 +25,10 @@ hypreGPUKernel_GetRowNnz( HYPRE_Int *d_offd_ia, HYPRE_Int *d_rownnz) { - -#ifdef HYPRE_USING_CUDA - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); #endif if (global_thread_id < nrows) @@ -55,18 +54,18 @@ hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row { HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); -#ifdef HYPRE_USING_CUDA - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, d_row_ind, d_row_ind + nnz, oneapi::dpl::counting_iterator(0), oneapi::dpl::counting_iterator(nrows + 1), d_row_ptr); +#else + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); #endif return d_row_ptr; @@ -76,18 +75,18 @@ HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr) { -#ifdef HYPRE_USING_CUDA - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); -#elif defined(HYPRE_USING_SYCL) +#if defined(HYPRE_USING_SYCL) HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, d_row_ind, d_row_ind + nnz, oneapi::dpl::counting_iterator(0), oneapi::dpl::counting_iterator(nrows + 1), d_row_ptr); +#else + HYPRE_THRUST_CALL( lower_bound, + d_row_ind, d_row_ind + nnz, + 
thrust::counting_iterator(0), + thrust::counting_iterator(nrows + 1), + d_row_ptr); #endif return hypre_error_flag; @@ -109,6 +108,73 @@ hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row return d_row_ind; } +HYPRE_Int +hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) +{ +#ifdef HYPRE_USING_SYCL + return HYPRE_ONEDPL_CALL(oneapi::dpl::reduce, d_i, d_i + n); +#else + return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); +#endif +} + +HYPRE_Int +hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) +{ +#if defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL(oneapi::dpl::inclusive_scan, d_i, d_i + n, d_i); +#else + HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); +#endif + return hypre_error_flag; +} + +HYPRE_Int +hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) +{ +#if defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL(oneapi::dpl::exclusive_scan, d_i, d_i + n, d_i); +#else + HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); +#endif + return hypre_error_flag; +} + +/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int + * Output: d_row_ind */ +template +HYPRE_Int +hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + T *d_row_num, T *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0) + { + return hypre_error_flag; + } + + HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); + +#ifdef HYPRE_USING_SYCL + hypreSycl_gather(map, map + nnz, d_row_num, d_row_ind); +#else + HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); +#endif + + hypre_TFree(map, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); +#if defined(HYPRE_MIXEDINT) +template HYPRE_Int 
hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); +#endif + #endif // HYPRE_USING_GPU #if defined(HYPRE_USING_SYCL) @@ -167,7 +233,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ /* d_row_ind ); */ - HYPRE_ONEDPL_CALL( std::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); return hypre_error_flag; @@ -445,28 +511,6 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, return hypre_error_flag; } -HYPRE_Int -hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) -{ - return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); -} - -HYPRE_Int -hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - -HYPRE_Int -hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v) { @@ -533,37 +577,6 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ return hypre_error_flag; } -/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int - * Output: d_row_ind */ -template -HYPRE_Int -hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - T *d_row_num, T *d_row_ind) -{ - /* trivial case */ - if (nrows <= 0) - { - return hypre_error_flag; - } - - HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); - - HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); - - hypre_TFree(map, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - 
-template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); -#if defined(HYPRE_MIXEDINT) -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); -#endif - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 54d9c8a620..54cc6e192e 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -355,6 +355,16 @@ HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, HYPRE_Int *d_row_ptr); +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) @@ -960,10 +970,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -975,12 +981,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int 
*d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1032,6 +1032,28 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) #define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } From ae30f749c62cfac4234876775a61bcc094cdb6fd Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 16:22:25 +0000 Subject: [PATCH 39/44] [SYCL] fix the sycl scatter_if --- src/utilities/device_utils.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index bf961145fd..c0dad8de28 100644 --- 
a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -224,14 +224,14 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); - /* // TODO: need to fix this by passing a "predicate" as last argument */ - /* HYPRE_ONEDPL_CALL( dpct::scatter_if, */ - /* oneapi::dpl::counting_iterator(0), */ - /* oneapi::dpl::counting_iterator(nrows), */ - /* d_row_ptr, */ - /* oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), */ - /* [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), */ - /* d_row_ind ); */ + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + d_row_ind, + oneapi::dpl::identity() ); HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, sycl::maximum() ); From c73ef06e14389cfa841369edb93f727ac2209710 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 14 Dec 2021 17:29:37 +0000 Subject: [PATCH 40/44] [SYCL] fix the build issues from std::exclusive_scan, lambda for scatter_if --- src/utilities/device_utils.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c index c0dad8de28..a75f9be6d2 100644 --- a/src/utilities/device_utils.c +++ b/src/utilities/device_utils.c @@ -133,7 +133,7 @@ HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) { #if defined(HYPRE_USING_SYCL) - HYPRE_ONEDPL_CALL(oneapi::dpl::exclusive_scan, d_i, d_i + n, d_i); + HYPRE_ONEDPL_CALL(std::exclusive_scan, d_i, d_i + n, d_i, 0, std::plus<>()); #else HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); #endif @@ -212,6 +212,16 @@ dim3 
hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } +struct hypre_empty_row_functor +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Int a = std::get<0>(t); + const HYPRE_Int b = std::get<1>(t); + return a != b; + } +}; + HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -229,7 +239,7 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ oneapi::dpl::counting_iterator(nrows), d_row_ptr, oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ), + hypre_empty_row_functor() ), d_row_ind, oneapi::dpl::identity() ); From d3e3bf028664434c61f48c1285d12c9ede049db2 Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 16 Dec 2021 19:33:29 +0000 Subject: [PATCH 41/44] [SYCL] cleanup a for SYCL kernel query helper functions --- src/utilities/_hypre_utilities.hpp | 16 +++++++++++----- src/utilities/device_utils.h | 16 +++++++++++----- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 24530a4d38..23f05e1ad2 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -112,6 +112,8 @@ struct hypre_device_allocator typedef sycl::range<1> dim3; #define __global__ +#define __host__ +#define __device__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -1164,7 +1166,7 @@ OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, /* return the number of (sub_groups) warps in (work-group) block */ template static __forceinline__ -hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) { return item.get_sub_group().get_group_range().get(0); } @@ -1172,9 +1174,9 @@ hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) /* return the 
thread lane id in warp */ template static __forceinline__ -hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& item) { - return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); + return item.get_sub_group().get_local_linear_id(); } // /* return the number of threads in grid */ @@ -1185,11 +1187,15 @@ hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) // return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); // } -/* return the flattened work-item/thread id in global work space */ -template +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > static __forceinline__ hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) { + static_assert(bdim == 1 && gdim == 1); return item.get_global_id(0); } diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index 54cc6e192e..dc85ff3f44 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -55,6 +55,8 @@ typedef sycl::range<1> dim3; #define __global__ +#define __host__ +#define __device__ /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -1107,7 +1109,7 @@ OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, /* return the number of (sub_groups) warps in (work-group) block */ template static __forceinline__ -hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) { return item.get_sub_group().get_group_range().get(0); } @@ -1115,9 +1117,9 @@ hypre_int hypre_gpu_get_num_warps(sycl::nd_item<1>& item) /* return the thread lane id in warp */ template static __forceinline__ -hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& 
item) { - return item.get_local_linear_id() & (HYPRE_WARP_SIZE-1); + return item.get_sub_group().get_local_linear_id(); } // /* return the number of threads in grid */ @@ -1128,11 +1130,15 @@ hypre_int hypre_gpu_get_lane_id(sycl::nd_item<1>& item) // return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); // } -/* return the flattened work-item/thread id in global work space */ -template +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > static __forceinline__ hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) { + static_assert(bdim == 1 && gdim == 1); return item.get_global_id(0); } From ad32b6f15c775e8e9c7966edaa4f80683e4c94ee Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 21 Dec 2021 15:53:27 +0000 Subject: [PATCH 42/44] [SYCL] simplify namespace for sycl::ext::oneapi::sub_group to sycl::sub_group --- src/seq_mv/csr_matop_device.c | 8 ++++---- src/utilities/_hypre_utilities.hpp | 6 +++--- src/utilities/device_utils.h | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index cd6e819515..55fde144e3 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -579,7 +579,7 @@ hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); @@ -634,7 +634,7 @@ hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = 
SG.shuffle(p, 0); @@ -697,7 +697,7 @@ hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, p = read_only_load(ia + row_i + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); @@ -769,7 +769,7 @@ hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, { p = read_only_load(ia + row + lane); } - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); q = SG.shuffle(p, 1); p = SG.shuffle(p, 0); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 23f05e1ad2..1a6f392e84 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -1285,11 +1285,11 @@ T read_only_load( const T *ptr ) // return in; // } -template +template static __forceinline__ -T warp_reduce_sum(T in, sycl::nd_item<1>& item) +T warp_reduce_sum(T in, sycl::nd_item& item) { - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); //sycl::ext::oneapi::reduce(SG, in, std::plus()); #pragma unroll for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h index dc85ff3f44..45006f9097 100644 --- a/src/utilities/device_utils.h +++ b/src/utilities/device_utils.h @@ -1228,11 +1228,11 @@ T read_only_load( const T *ptr ) // return in; // } -template +template static __forceinline__ -T warp_reduce_sum(T in, sycl::nd_item<1>& item) +T warp_reduce_sum(T in, sycl::nd_item& item) { - sycl::ext::oneapi::sub_group SG = item.get_sub_group(); + sycl::sub_group SG = item.get_sub_group(); //sycl::ext::oneapi::reduce(SG, in, std::plus()); #pragma unroll for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) From 9c6b6bc9988286869023c5514483720a2ded0ddd Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Tue, 21 Dec 2021 20:27:43 +0000 Subject: [PATCH 43/44] [SYCL] unify code for 
CUDA, HIP and SYCL for easier maintanence --- src/seq_mv/csr_matop_device.c | 3113 +++++++++++++-------------------- 1 file changed, 1204 insertions(+), 1909 deletions(-) diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c index 55fde144e3..bacc0b28fe 100644 --- a/src/seq_mv/csr_matop_device.c +++ b/src/seq_mv/csr_matop_device.c @@ -108,1531 +108,404 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data) #endif /* #if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) */ -/* ABB: All the compute kernel implementations are grouped here */ #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) -__global__ void -hypreGPUKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) +HYPRE_Int +hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int 
B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; - if (row >= nrows) + in_range pred1(first_col_diag_B, last_col_diag_B); + + /* get diag and offd nnz */ + if (job == 0) { - return; - } + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - if (lane < 2) + return hypre_error_flag; + } + else { - p = read_only_load(ia + row + lane); + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } + if (B_ext_diag_xata) + { + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */ + pred1 ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } -} - -/* check if diagonal entry is the first one at each row - * 
Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -__global__ void -hypreGPUKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); - if (row < nrows) + else { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */ + pred1 ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } -} -__global__ void -hypreGPUKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_THRUST_CALL( transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + thrust::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + thrust::minus()); - if (row >= nrows) - { - return; - } + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - if (lane < 2) + if (B_ext_offd_xata) { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ + 
thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */ + thrust::not1(pred1) ); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); + } + else { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + auto new_end = HYPRE_THRUST_CALL( + copy_if, + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */ + thrust::not1(pred1) ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); -__global__ void -hypreGPUKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = 
hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_THRUST_CALL( sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - if (row >= nrows) - { - return; - } + HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + num_cols_offd_C = new_end - col_map_offd_C; - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; +#else + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); +#endif - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_THRUST_CALL( lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); + } - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } + HYPRE_THRUST_CALL( lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = true; - break; - } - } + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - if (result && !has_diag && lane == 0) + if 
(map_B_to_C_ptr) { - result[row] = 1; + *map_B_to_C_ptr = map_B_to_C; } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; + + return hypre_error_flag; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -__global__ void -hypreGPUKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +typedef thrust::tuple Int2; +struct Int2Unequal : public thrust::unary_function { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); - - if (row_i >= nrows) + __host__ __device__ + bool operator()(const Int2& t) const { - return; + return (thrust::get<0>(t) != thrust::get<1>(t)); } +}; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> +{ + __host__ __device__ + bool operator()(const thrust::tuple& t) const { - p = read_only_load(ia + row_i + lane); + const HYPRE_Complex i = thrust::get<0>(t); + const HYPRE_Real j = thrust::get<1>(t); + + return hypre_cabs(i) > j; } +}; - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - HYPRE_Complex row_sum_i = 0.0; +#if defined(HYPRE_USING_SYCL) - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } +HYPRE_Int +hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ + HYPRE_Int num_rows, + HYPRE_Int B_ext_nnz, + HYPRE_Int *B_ext_ii, /* Note: this is NOT row 
pointers as in CSR but row indices as in COO */ + HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ + HYPRE_Complex *B_ext_data, + char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ + HYPRE_BigInt first_col_diag_B, + HYPRE_BigInt last_col_diag_B, + HYPRE_Int num_cols_offd_B, + HYPRE_BigInt *col_map_offd_B, + HYPRE_Int **map_B_to_C_ptr, + HYPRE_Int *num_cols_offd_C_ptr, + HYPRE_BigInt **col_map_offd_C_ptr, + HYPRE_Int *B_ext_diag_nnz_ptr, + HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_diag_j, + HYPRE_Complex *B_ext_diag_data, + char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ + HYPRE_Int *B_ext_offd_nnz_ptr, + HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ + HYPRE_Int *B_ext_offd_j, + HYPRE_Complex *B_ext_offd_data, + char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) +{ + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_BigInt *B_ext_diag_bigj = NULL; + HYPRE_BigInt *B_ext_offd_bigj = NULL; + HYPRE_BigInt *col_map_offd_C; + HYPRE_Int *map_B_to_C = NULL; + HYPRE_Int num_cols_offd_C; - HYPRE_Complex aii = aa[j]; + in_range pred1(first_col_diag_B, last_col_diag_B); - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } + /* get diag and offd nnz */ + if (job == 0) { + /* query the nnz's */ + B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, + B_ext_bigj, + B_ext_bigj + B_ext_nnz, + pred1 ); + B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - row_sum_i = warp_reduce_sum(row_sum_i); + *B_ext_diag_nnz_ptr = B_ext_diag_nnz; + *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } + return hypre_error_flag; + } + else { + B_ext_diag_nnz = *B_ext_diag_nnz_ptr; + B_ext_offd_nnz = *B_ext_offd_nnz_ptr; } -} -/* type 0: 
diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreGPUKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + /* copy to diag */ + B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - if (row >= nrows) - { - return; - } + if (B_ext_diag_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ + pred1 ); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); + } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ + pred1 ); - if (lane < 2) - { - p = read_only_load(ia + row + lane); + //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - HYPRE_Int has_diag = 0; + HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); + HYPRE_ONEDPL_CALL( std::transform, + B_ext_diag_bigj, + B_ext_diag_bigj + B_ext_diag_nnz, + const_iterator, 
//dpct::make_constant_iterator(first_col_diag_B), + B_ext_diag_j, + std::minus() ); + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + /* copy to offd */ + B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } + if (B_ext_offd_xata) { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ + std::not_fn(pred1) ); + + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } + else { + auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, /* first */ + first + B_ext_nnz, /* last */ + B_ext_bigj, /* stencil */ + oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ + std::not_fn(pred1) ); - if (!has_diag && lane == 0) - { - d[row] = 0.0; + // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); } -} -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreGPUKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - 
HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); + /* offd map of B_ext_offd Union col_map_offd_B */ + col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (i >= n) - { - return; - } + HYPRE_ONEDPL_CALL( std::sort, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); + HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, + col_map_offd_C, + col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -#elif defined(HYPRE_USING_SYCL) - -void -hypreGPUKernel_CSRMoveDiagFirst( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) -{ - HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - break; - } - } -} - -/* check if diagonal entry is the first one at each row - * Return: the number of rows that do not have the first entry as diagonal - * RL: only check if it's a non-empty row - */ -void -hypreGPUKernel_CSRCheckDiagFirst( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); - if (row < nrows) - { - result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); - } -} - -void -hypreGPUKernel_CSRMatrixFixZeroDiagDevice( sycl::nd_item<1>& item, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = 
SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = true; - break; - } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} - -void -hypreGPUKernel_CSRMatrixReplaceDiagDevice( sycl::nd_item<1>& item, - HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) -{ - const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - - if (find_diag) - { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = true; - break; - } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } -} - -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -void -hypreGPUKernel_CSRRowSum( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) -{ - HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row_i >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - 
{ - p = read_only_load(ia + row_i + lane); - } - - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - HYPRE_Complex row_sum_i = 0.0; - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } - - HYPRE_Complex aii = aa[j]; - - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } - - row_sum_i = warp_reduce_sum(row_sum_i, item); - - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } -} - -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -void -hypreGPUKernel_CSRExtractDiag( sycl::nd_item<1>& item, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) -{ - HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - sycl::sub_group SG = item.get_sub_group(); - q = SG.shuffle(p, 1); - p = SG.shuffle(p, 0); - - HYPRE_Int has_diag = 0; - - for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) - { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } - - if ( sycl::any_of_group(SG, find_diag) ) - { - has_diag = 1; - break; - } - } - - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } 
-} - -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -void -hypreGPUKernel_CSRMatrixIntersectPattern( sycl::nd_item<1>& item, - HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); - - if (i >= n) - { - return; - } - - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); - - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -#endif // HYPRE_USING_SYCL - - -#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) - -hypre_CSRMatrix* -hypre_CSRMatrixAddDevice ( HYPRE_Complex alpha, - hypre_CSRMatrix *A, - HYPRE_Complex beta, - hypre_CSRMatrix *B ) -{ - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; 
- hypre_CSRMatrix *C; - - if (nrows_A != nrows_B || ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, - A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -hypre_CSRMatrix* -hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A, - hypre_CSRMatrix *B) -{ - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - hypre_CSRMatrix *C; - - if (ncols_A != nrows_B) - { - hypre_printf("Warning! incompatible matrix dimensions!\n"); - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpGemm(A, B, &C); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -hypre_CSRMatrix* -hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - hypre_CSRMatrix *C ) -{ - hypre_CSRMatrix *BC = hypre_CSRMatrixMultiplyDevice(B, C); - hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC); - - hypre_CSRMatrixDestroy(BC); - - return ABC; -} - -HYPRE_Int -hypre_CSRMatrixTriLowerUpperSolveDevice(char uplo, - hypre_CSRMatrix *A, - HYPRE_Real *l1_norms, - hypre_Vector *f, - hypre_Vector *u ) -{ -#if defined(HYPRE_USING_CUSPARSE) - hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u); -#elif defined(HYPRE_USING_ROCSPARSE) - hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u); -#else - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n"); -#endif - return 
hypre_error_flag; -} - -/* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part - * corresponding to B. - * Input col_map_offd_B: - * Output col_map_offd_C: union of col_map_offd_B and offd-indices of Bext_offd - * map_B_to_C: mapping from col_map_offd_B to col_map_offd_C - */ - -HYPRE_Int -hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - hypre_CSRMatrix **B_ext_diag_ptr, - hypre_CSRMatrix **B_ext_offd_ptr ) -{ - HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); - HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); - - HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); - hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); - - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_Int ierr; - - ierr = hypre_CSRMatrixSplitDevice_core( 0, - num_rows, - B_ext_nnz, - NULL, - hypre_CSRMatrixBigJ(B_ext), - NULL, - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - NULL, - NULL, - NULL, - NULL, - &B_ext_diag_nnz, - NULL, - NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - 
B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt 
*B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) -{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) - { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else - { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* 
stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_THRUST_CALL( transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - thrust::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - thrust::minus()); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - 
thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - num_cols_offd_C = new_end - col_map_offd_C; - -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif - - /* create map from col_map_offd_B */ - if (num_cols_offd_B) - { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } - - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); - - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - - if (map_B_to_C_ptr) - { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = 
num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; - - return hypre_error_flag; -} - -/*-------------------------------------------------------------------------- - * hypre_CSRMatrixAddPartial: - * adds matrix rows in the CSR matrix B to the CSR Matrix A, where row_nums[i] - * defines to which row of A the i-th row of B is added, and returns a CSR Matrix C; - * Repeated row indices are allowed in row_nums - * Note: The routine does not check for 0-elements which might be generated - * through cancellation of elements in A and B or already contained - * in A and B. To remove those, use hypre_CSRMatrixDeleteZeros - *--------------------------------------------------------------------------*/ - -hypre_CSRMatrix* -hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *row_nums) -{ - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; - hypre_CSRMatrix *C; - - if (ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! 
Incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return C; -} - -HYPRE_Int -hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, - HYPRE_Real *colnnz) -{ - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_j_sorted; - HYPRE_Int num_reduced_col_indices; - HYPRE_Int *reduced_col_indices; - HYPRE_Int *reduced_col_nnz; - - A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - - thrust::pair new_end = - HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, - thrust::make_constant_iterator(1), - reduced_col_indices, - reduced_col_nnz); - - hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); - - num_reduced_col_indices = new_end.first - reduced_col_indices; - - hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, - reduced_col_indices, colnnz); - - hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return 
hypre_error_flag; -} - -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; - -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; - - thrust::zip_iterator< thrust::tuple > new_end; - - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - 
thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - -/* return C = [A; B] */ -hypre_CSRMatrix* -hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) -{ - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, 
HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; -} - -/* A = alp * I */ -hypre_CSRMatrix * -hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) -{ - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); - - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); - - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); - - return A; -} - -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public thrust::unary_function,bool> -{ - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); - - return hypre_cabs(i) > j; - } -}; - -/* drop the entries that are smaller than: - * tol if elmt_tols == null, - * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ -HYPRE_Int -hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, - HYPRE_Real tol, - HYPRE_Real *elmt_tols) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - 
HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, - A_data, - A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, - cabsfirst_greaterthan_second_pred() ); - } - - if (new_nnz == nnz) - { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - if (!A_ii) - { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; + num_cols_offd_C = new_end - col_map_offd_C; - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - cabsfirst_greaterthan_second_pred() ); +#if 1 + HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_DEVICE); + 
hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); + col_map_offd_C = tmp; +#else + col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, + HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); +#endif + + /* create map from col_map_offd_B */ + if (num_cols_offd_B) { + map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + col_map_offd_B, + col_map_offd_B + num_cols_offd_B, + map_B_to_C ); } - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, + col_map_offd_C, + col_map_offd_C + num_cols_offd_C, + B_ext_offd_bigj, + B_ext_offd_bigj + B_ext_offd_nnz, + B_ext_offd_j ); - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + if (map_B_to_C_ptr) { + *map_B_to_C_ptr = map_B_to_C; + } + *num_cols_offd_C_ptr = num_cols_offd_C; + *col_map_offd_C_ptr = col_map_offd_C; return hypre_error_flag; } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ +/* this predicate compares first and second element in a tuple in absolute value */ +/* first is assumed to be complex, second to be real > 0 */ +struct cabsfirst_greaterthan_second_pred +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Complex i = std::get<0>(t); + const HYPRE_Real j = std::get<1>(t); + + return hypre_cabs(i) > j; + } +}; + +#endif /* HYPRE_USING_SYCL */ -#if defined(HYPRE_USING_SYCL) + +#if defined(HYPRE_USING_GPU) hypre_CSRMatrix* hypre_CSRMatrixAddDevice 
( HYPRE_Complex alpha, @@ -1759,272 +632,98 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix *B_ext, hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_Int ierr; - - ierr = hypre_CSRMatrixSplitDevice_core( 0, - num_rows, - B_ext_nnz, - NULL, - hypre_CSRMatrixBigJ(B_ext), - NULL, - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - NULL, - NULL, - NULL, - NULL, - &B_ext_diag_nnz, - NULL, - NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and 
offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncDeviceComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char 
*B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) -{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */ - pred1 ); - - //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz ); - } - else { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */ - pred1 ); - - //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, 
B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait(); - HYPRE_ONEDPL_CALL( std::transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - const_iterator, //dpct::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - std::minus() ); - hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */ - std::not_fn(pred1) ); + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; - // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else { - auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, /* first */ - first + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */ - std::not_fn(pred1) ); + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); - // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, 
B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL( std::sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); - HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - num_cols_offd_C = new_end - col_map_offd_C; + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - 
col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, - HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - /* create map from col_map_offd_B */ - if (num_cols_offd_B) { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - if (map_B_to_C_ptr) { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; - return hypre_error_flag; + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; } 
/*-------------------------------------------------------------------------- @@ -2062,7 +761,7 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, if (ncols_A != ncols_B) { - hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n"); + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! Incompatible matrix dimensions!\n"); return NULL; } @@ -2092,15 +791,15 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, HYPRE_Int num_reduced_col_indices; HYPRE_Int *reduced_col_indices; HYPRE_Int *reduced_col_nnz; + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); - // ABB: Replace values in-place with dpct::make_constant_iterator(1) HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); std::pair new_end = @@ -2109,263 +808,534 @@ hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, reduced_col_indices, reduced_col_nnz ); + hypre_TFree(values, HYPRE_MEMORY_UNIFIED); +#else + HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); + + thrust::pair new_end = + HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, + thrust::make_constant_iterator(1), + reduced_col_indices, + reduced_col_nnz); +#endif + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); num_reduced_col_indices = new_end.first - reduced_col_indices; hypre_Memset(colnnz, 
0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); +#else + HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + reduced_col_indices, colnnz); +#endif hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_TFree(values, HYPRE_MEMORY_UNIFIED); hypre_SyncDeviceComputeStream(hypre_handle()); return hypre_error_flag; } -HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +__global__ void +hypreGPUKernel_CSRMoveDiagFirst( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif - auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - zipped_begin, zipped_begin + nnz, - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + if (row >= nrows) + { + return; + } - if (new_nnz == nnz) + HYPRE_Int p = 0, q = 0; + + if (lane < 2) { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return 
hypre_error_flag; + p = read_only_load(ia + row + lane); } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (A_data) + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + hypre_int find_diag = j < q && ja[j] == row; - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - oneapi::dpl::make_zip_iterator(A_ii, A_j), - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } - // todo: fix this - // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + break; + } } - else +} + +/* check if diagonal entry is the first one at each row + * Return: the number of rows that do not have the first entry as diagonal + * RL: only check if it's a non-empty row + */ +__global__ void +hypreGPUKernel_CSRCheckDiagFirst( +#ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, +#endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != 
row); + } +} + +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + + if (row >= nrows) { - new_data = NULL; + return; + } - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); - auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - first, - oneapi::dpl::make_zip_iterator(new_ii, new_j), - [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; - // todo: fix this - // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); + if (lane < 2) + { + p = read_only_load(ia + row + lane); } - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return hypre_error_flag; -} + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; -/* return C = [A; B] */ 
-hypre_CSRMatrix* -hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) -{ - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } +} - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); +__global__ void +hypreGPUKernel_CSRMatrixReplaceDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) +{ +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } - HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, 
HYPRE_MEMORY_DEVICE); - hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); - HYPRE_ONEDPL_CALL( std::transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - std::plus() ); - hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) + { + d = v; + } + data[j] = d; + } - return C; +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + 
if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } -/* A = alp * I */ -hypre_CSRMatrix * -hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) + */ +template +__global__ void +hypreGPUKernel_CSRRowSum( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) { - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row_i >= nrows) + { + return; + } - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + HYPRE_Int p = 0, q = 0; - HYPRE_ONEDPL_CALL( dpct::iota, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + if (lane < 2) + { + p = read_only_load(ia + row_i + lane); + } - HYPRE_ONEDPL_CALL( dpct::iota, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); + HYPRE_Complex row_sum_i = 0.0; - HYPRE_ONEDPL_CALL( std::fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - return A; -} + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred -{ - bool operator()(const 
std::tuple& t) const + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) { - const HYPRE_Complex i = std::get<0>(t); - const HYPRE_Real j = std::get<1>(t); + continue; + } - return hypre_cabs(i) > j; + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; } -}; + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } +#ifdef HYPRE_USING_SYCL + row_sum_i = warp_reduce_sum(row_sum_i, item); +#else + row_sum_i = warp_reduce_sum(row_sum_i); +#endif + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } +} -/* drop the entries that are smaller than: - * tol if elmt_tols == null, - * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ -HYPRE_Int -hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, - HYPRE_Real tol, - HYPRE_Real *elmt_tols) +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ +__global__ void +hypreGPUKernel_CSRExtractDiag( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = 
hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } - if (elmt_tols == NULL) + HYPRE_Int p = 0, q = 0; + + if (lane < 2) { - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - A_data, - A_data + nnz, - std::not_fn(less_than(tol)) ); + p = read_only_load(ia + row + lane); } - else + HYPRE_Int has_diag = 0; +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif { - auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); - new_nnz = HYPRE_ONEDPL_CALL( std::count_if, - first, - first + nnz, - cabsfirst_greaterthan_second_pred() ); + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } + +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = 1; + break; + } } - if (new_nnz == nnz) + if (!has_diag && lane == 0) { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + d[row] = 0.0; } +} - if (!A_ii) +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ +#ifdef HYPRE_USING_SYCL + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); 
+#else + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + + if (i >= n) { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + return; } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); - if (elmt_tols == NULL) + if (0 == diag_option) { - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - A_data, - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - std::not_fn(less_than(tol)) ); + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } } - else + else if (1 == diag_option) { - auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); - new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, - first, first + nnz, - oneapi::dpl::make_zip_iterator(A_data, elmt_tols), - oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), - cabsfirst_greaterthan_second_pred() ); + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? 
read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } } - - // todo: fix this - // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); - return hypre_error_flag; } -#endif /* HYPRE_USING_SYCL */ - - -#if defined(HYPRE_USING_GPU) - /* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v * Does NOT assume diagonal is the first entry of each row of A * In debug mode: @@ -2432,33 +1402,133 @@ hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); -#else - HYPRE_Int *result = NULL; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif + + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); + +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; +} + +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, 
A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + +#ifdef HYPRE_USING_SYCL + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); +#endif + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } + else + { + new_data = NULL; +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + 
oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // TODO: abb fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); #endif + } - HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); - -#if HYPRE_DEBUG -#if defined(HYPRE_USING_CUDA) - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); -#elif defined(HYPRE_USING_SYCL) - ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); -#endif - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif // HYPRE_DEBUG + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_SyncDeviceComputeStream(hypre_handle()); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return ierr; + return hypre_error_flag; } HYPRE_Int @@ -2583,6 +1653,232 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, hypre_SyncDeviceComputeStream(hypre_handle()); } +/* return C = [A; B] */ +hypre_CSRMatrix* +hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) +{ + hypre_assert( hypre_CSRMatrixNumCols(A) 
== hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + + HYPRE_ONEDPL_CALL( std::transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + std::plus() ); + + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); +#else + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); +#endif + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, 
hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; +} + +/* A = alp * I */ +hypre_CSRMatrix * +hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) +{ + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); + + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#else + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#endif + return A; +} + + +/* drop the entries that are smaller than: + * tol if elmt_tols == null, + * elmt_tols[j] otherwise where j = 0...NumNonzeros(A) */ +HYPRE_Int +hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, + HYPRE_Real tol, + HYPRE_Real *elmt_tols) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { +#ifdef HYPRE_USING_SYCL + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + 
A_data, + A_data + nnz, + std::not_fn(less_than(tol)) ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); +#endif + } + else + { +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, + cabsfirst_greaterthan_second_pred() ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, + cabsfirst_greaterthan_second_pred() ); +#endif + } + + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + + if (elmt_tols == NULL) + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + A_data, + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than(tol)) ); + } + else + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + cabsfirst_greaterthan_second_pred() ); + } + + // TODO: abb fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + 
A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + void hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, HYPRE_Int *CF_i, @@ -2619,7 +1915,6 @@ hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, hypre_SyncDeviceComputeStream(hypre_handle()); } - HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, From bcf0e579e0e3552d151151e49d52594bdf85e62e Mon Sep 17 00:00:00 2001 From: Abhishek Bagusetty Date: Thu, 23 Dec 2021 15:25:07 +0000 Subject: [PATCH 44/44] fix complex data types preprocessor for CUDA, HIP --- src/utilities/HYPRE_utilities.h | 2 +- src/utilities/complex.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index f8bbb154f8..6ac7ccd255 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -86,7 +86,7 @@ typedef double HYPRE_Real; #if defined(HYPRE_USING_SYCL) typedef std::complex HYPRE_Complex; -#elif defined(HYPRE_USING_GPU) 
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) typedef thrust::complex HYPRE_Complex; #else typedef double _Complex HYPRE_Complex; diff --git a/src/utilities/complex.c b/src/utilities/complex.c index ba04d01577..59b71bbf56 100644 --- a/src/utilities/complex.c +++ b/src/utilities/complex.c @@ -14,7 +14,7 @@ hypre_conj( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::conj(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::conj(value); #else return conj(value); @@ -26,7 +26,7 @@ hypre_cabs( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::abs(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::abs(value); #else return cabs(value); @@ -38,7 +38,7 @@ hypre_creal( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::real(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::real(value); #else return creal(value); @@ -50,7 +50,7 @@ hypre_cimag( HYPRE_Complex value ) { #ifdef HYPRE_USING_SYCL return std::imag(value); -#elif defined(HYPRE_USING_GPU) +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) return thrust::imag(value); #else return cimag(value);