diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c
index 9a83b3da62..00f573553f 100644
--- a/src/IJ_mv/IJMatrix_parcsr_device.c
+++ b/src/IJ_mv/IJMatrix_parcsr_device.c
@@ -164,7 +164,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix       *matrix,
    /* mark unwanted elements as -1 */
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
                      (HYPRE_Int *) row_indexes, ncols, indicator );

    auto new_end = HYPRE_THRUST_CALL(
@@ -233,7 +233,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int  N0, HYPRE_BigInt  *I0, HYPRE_Big
    /*
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
    */

    /* output X: 0: keep, 1: zero-out */
diff --git a/src/IJ_mv/IJVector_parcsr_device.c b/src/IJ_mv/IJVector_parcsr_device.c
index b34b1162f7..a26d19dd93 100644
--- a/src/IJ_mv/IJVector_parcsr_device.c
+++ b/src/IJ_mv/IJVector_parcsr_device.c
@@ -251,7 +251,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector)
       /* set/add to local vector */
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
                         vec_start, new_sora,
                         hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) );

diff --git a/src/config/configure.in b/src/config/configure.in
index 06e6a22796..8edcabc68c 100644
--- a/src/config/configure.in
+++ b/src/config/configure.in
@@ -2316,7 +2316,7 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"],

     if test "$hypre_user_chose_cuflags" = "no"
     then
-       CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
+       CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
        if test "$hypre_using_debug" = "yes"
        then
           CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
diff --git a/src/configure b/src/configure
index 7993465afb..66d6707f63 100755
--- a/src/configure
+++ b/src/configure
@@ -9143,7 +9143,7 @@ $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h

     if test "$hypre_user_chose_cuflags" = "no"
     then
-       CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
+       CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
        if test "$hypre_using_debug" = "yes"
        then
           CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
diff --git a/src/distributed_matrix/distributed_matrix_parcsr.c b/src/distributed_matrix/distributed_matrix_parcsr.c
index 0df9ae59e8..e6d986dddb 100644
--- a/src/distributed_matrix/distributed_matrix_parcsr.c
+++ b/src/distributed_matrix/distributed_matrix_parcsr.c
@@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix,

    // RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done
 #if defined(HYPRE_USING_GPU)
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    return(ierr);
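/*
 * [illustrative sketch -- not part of the patch] The hunks above and below are a
 * mechanical rename, HYPRE_CUDA_LAUNCH -> HYPRE_GPU_LAUNCH and
 * hypre_SyncCudaComputeStream -> hypre_SyncDeviceComputeStream, so call sites
 * no longer name a specific backend. hypre's real macro lives in its device
 * utility headers, which this diff does not touch; the dispatch idea for the
 * CUDA/HIP case is roughly:
 */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#define HYPRE_GPU_LAUNCH(kernel, gridsize, blocksize, ...)                     \
   do                                                                          \
   {                                                                           \
      /* launch on hypre's compute stream; a SYCL build would instead       */ \
      /* submit the kernel body to the compute queue                        */ \
      kernel <<< (gridsize), (blocksize), 0,                                   \
                 hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__);  \
   } while (0)
#endif
/*
 * The CUFLAGS change adds -D_GLIBCXX_USE_TBB_PAR_BACKEND=0, which makes
 * libstdc++'s parallel algorithms fall back to their serial backend instead of
 * requiring TBB -- the SYCL build already routes parallelism through oneDPL.
 */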
diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c
index 03c3fccb3d..9288e60b29 100644
--- a/src/parcsr_ls/ads.c
+++ b/src/parcsr_ls/ads.c
@@ -627,12 +627,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            F2V_diag_nrows, 3, F2V_diag_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pi_diag_data );
       }
@@ -693,12 +693,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            F2V_offd_nrows, 3, F2V_offd_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pi_offd_data );
       }
@@ -907,7 +907,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            F2V_diag_nrows, 3, F2V_diag_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pix_diag_data, Piy_diag_data, Piz_diag_data );
       }
@@ -987,7 +987,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            F2V_offd_nrows, 3, F2V_offd_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pix_offd_data, Piy_offd_data, Piz_offd_data );
       }
diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c
index 1f4de312c3..f68266d5a8 100644
--- a/src/parcsr_ls/ame.c
+++ b/src/parcsr_ls/ame.c
@@ -496,7 +496,7 @@ HYPRE_Int hypre_AMESetup(void *esolver)
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
                         nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc );
    }
    else
diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c
index c1d43292a5..9a90c0a71c 100644
--- a/src/parcsr_ls/ams.c
+++ b/src/parcsr_ls/ams.c
@@ -194,7 +194,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
    }
    else
@@ -241,7 +241,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
    }
    else
@@ -456,10 +456,10 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
+   HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
                      nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);

-   //hypre_SyncCudaComputeStream(hypre_handle());
+   //hypre_SyncDeviceComputeStream(hypre_handle());

    return hypre_error_flag;
 }
@@ -787,7 +787,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
                         num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
    }
    else
@@ -1623,12 +1623,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_diag_nnz, dim, G_diag_J, Pi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data,
                            Gx_data, Gy_data, Gz_data,
                            Pi_diag_data );
       }
@@ -1696,12 +1696,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_offd_nnz, dim, G_offd_J, Pi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data,
                            Gx_data, Gy_data, Gz_data,
                            Pi_offd_data );
       }
@@ -1944,7 +1944,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
                            Pix_diag_data, Piy_diag_data, Piz_diag_data );
       }
@@ -2010,7 +2010,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
                            Pix_diag_data, Piy_diag_data, NULL );
       }
@@ -2068,7 +2068,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
                            Pix_diag_data, NULL, NULL );
       }
@@ -2145,7 +2145,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
                            Pix_offd_data, Piy_offd_data, Piz_offd_data );
       }
@@ -2227,7 +2227,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
                            Pix_offd_data, Piy_offd_data, NULL );
       }
@@ -2299,7 +2299,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
                            Pix_offd_data, NULL, NULL );
       }
@@ -2501,12 +2501,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_diag_nnz, dim, G_diag_J, GPi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data,
                            Gx_data, Gy_data, Gz_data,
                            GPi_diag_data );
       }
@@ -2575,12 +2575,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_offd_nnz, dim, G_offd_J, GPi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data,
                            Gx_data, Gy_data, Gz_data,
                            GPi_offd_data );
       }
@@ -2815,7 +2815,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
                         nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
    }
    else
@@ -3401,7 +3401,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
                         Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data,
                         Gt_offd_I, Gt_offd_data,
                         Gx_data, Gy_data, Gz_data );
    }
diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c
index 94156f2e8b..5ad3f91c46 100644
--- a/src/parcsr_ls/par_2s_interp_device.c
+++ b/src/parcsr_ls/par_2s_interp_device.c
@@ -109,7 +109,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix  *A,
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

    /* only for rows corresponding to F2 (notice flag == -1) */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_local,
                      A_offd_nnz > 0,
@@ -160,7 +160,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix  *A,
     * diagnoally scale As_F2F (from both sides) and replace the diagonal */
    gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
                      gDim, bDim,
                      AF2F_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
@@ -329,7 +329,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
    dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
    dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
                      gDim, bDim,
                      AFC_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)),
@@ -388,7 +388,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
    gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

    /* only for rows corresponding to F2 (notice flag == -1) */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_local,
                      A_offd_nnz > 0,
@@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
     * diagnoally scale As_F2F (from both sides) and replace the diagonal */
    gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
                      gDim, bDim,
                      AF2F_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
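/*
 * [illustrative sketch -- not part of the patch] The launches in these files
 * size their grids with hypre_GetDefaultDeviceGridDimension(n, "thread"|"warp",
 * bDim): "thread" asks for one thread per element (e.g. per nonzero copied),
 * "warp" for one warp per matrix row so the row's entries are scanned
 * cooperatively. Assuming hypre's HYPRE_WARP_SIZE macro, the sizing
 * arithmetic behind such a helper is essentially:
 */
static dim3 sketch_grid_dimension(HYPRE_Int n, const char *granularity, dim3 bDim)
{
   const HYPRE_Int threads_per_block = bDim.x * bDim.y * bDim.z;
   HYPRE_Int num_blocks;
   if (granularity[0] == 't')   /* "thread": one work item per element */
   {
      num_blocks = (n + threads_per_block - 1) / threads_per_block;
   }
   else                         /* "warp": one warp per row */
   {
      const HYPRE_Int warps_per_block = threads_per_block / HYPRE_WARP_SIZE;
      num_blocks = (n + warps_per_block - 1) / warps_per_block;
   }
   return dim3(num_blocks > 0 ? num_blocks : 1, 1, 1);
}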
diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c
index 70749d9abc..a2c9fb47e2 100644
--- a/src/parcsr_ls/par_coarsen_device.c
+++ b/src/parcsr_ls/par_coarsen_device.c
@@ -331,7 +331,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S,     /* in */
    HYPRE_Int *new_end;

    /* init CF_marker_diag and measure_diag: remove some special nodes */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
                      num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag );

    /* communicate for measure_offd */
@@ -494,7 +494,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S,     /* in
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
                      gDim, bDim,
                      graph_diag_size,
                      graph_diag,
diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c
index dcce956b40..85010edbaa 100644
--- a/src/parcsr_ls/par_gauss_elim.c
+++ b/src/parcsr_ls/par_gauss_elim.c
@@ -424,7 +424,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real
    dim3 bDim(BLOCK_SIZE, 1, 1);
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );

    return hypre_error_flag;
 }
diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c
index 3d1d9c60c1..8e40e2c9f6 100644
--- a/src/parcsr_ls/par_indepset_device.c
+++ b/src/parcsr_ls/par_indepset_device.c
@@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix  *S,
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
                      graph_diag_size, graph_diag, measure_diag, measure_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      IS_marker_diag, IS_marker_offd, IS_offd_temp_mark );
@@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix  *S,

    /* adjust IS_marker_diag from the received */
    gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
                      IS_marker_diag, num_elmts_send, send_map_elmts,
                      int_send_buf, IS_offd_temp_mark );
diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c
index 83139d52ac..3dfac1dca9 100644
--- a/src/parcsr_ls/par_interp_device.c
+++ b/src/parcsr_ls/par_interp_device.c
@@ -197,7 +197,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
                      n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, CF_marker, CF_marker_offd,
                      num_functions, dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
@@ -228,7 +228,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,

    if (interp_type == 3)
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         hypre_ParCSRMatrixSocDiagJ(S),
@@ -241,7 +241,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,
    }
    else
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         hypre_ParCSRMatrixSocDiagJ(S),
@@ -1161,7 +1161,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
                      n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, A_offd_a,
                      CF_marker, CF_marker_offd, diag_compress_marker, offd_compress_marker,
                      P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);
diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c
index 2deaf29eff..f73270e4c8 100644
--- a/src/parcsr_ls/par_interp_trunc_device.c
+++ b/src/parcsr_ls/par_interp_trunc_device.c
@@ -162,7 +162,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
                      nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a );

    /* build new P_diag and P_offd */
diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c
index e21d3e8cdb..23a4d723af 100644
--- a/src/parcsr_ls/par_lr_interp_device.c
+++ b/src/parcsr_ls/par_lr_interp_device.c
@@ -87,7 +87,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -128,7 +128,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix  *A,
    /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@@ -273,7 +273,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix   *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -352,7 +352,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix   *A,
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_ParCSRMatrixFirstRowIndex(AFF),
@@ -502,7 +502,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -545,7 +545,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE);
    hypre_GpuProfilingPushRange("Compute D_tmp");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@@ -587,7 +587,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c
index 18bf10fc88..97f3b8be9b 100644
--- a/src/parcsr_ls/par_lr_restr_device.c
+++ b/src/parcsr_ls/par_lr_restr_device.c
@@ -254,7 +254,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix   *A,
    /* assemble the diagonal part of R from Z */
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
                      n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a,
                      R_diag_i, R_diag_j, R_diag_a);

    num_cols_offd_R = num_cols_offd_Z;
diff --git a/src/parcsr_ls/par_mod_multi_interp_device.c b/src/parcsr_ls/par_mod_multi_interp_device.c
index 5aea7a00d3..36d20ab022 100644
--- a/src/parcsr_ls/par_mod_multi_interp_device.c
+++ b/src/parcsr_ls/par_mod_multi_interp_device.c
@@ -343,7 +343,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(remaining, "warp", bDim);

       /* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count,
                         gDim, bDim,
                         remaining,
                         current_pass,
@@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         CF_marker,
@@ -591,7 +591,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
                            pass_starts[p + 1], pass_starts[p + 2], pass_order,
                            Pi_diag_i, Pi_diag_j, Pi_diag_data,
                            P_diag_i, P_diag_j, P_diag_data,
@@ -654,7 +654,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(npoints, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
                            pass_starts[p + 1],
                            pass_starts[p + 2],
                            pass_order,
@@ -893,7 +893,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
                      num_points, color, pass_order, pass_marker, pass_marker_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      P_diag_i, P_offd_i );
@@ -921,7 +921,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
                      num_points,
                      color,
                      pass_order,
@@ -1144,7 +1144,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
                      num_points, color, pass_order, pass_marker, pass_marker_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      Q_diag_i, Q_offd_i );
@@ -1173,7 +1173,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
                      num_points,
                      color,
                      pass_order,
@@ -1244,7 +1244,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
                      num_points, pass_order, A_diag_i, A_diag_data,
                      Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data,
                      w_row_sum );
diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c
index 608bc4209d..63d6b7df03 100644
--- a/src/parcsr_ls/par_relax.c
+++ b/src/parcsr_ls/par_relax.c
@@ -1117,8 +1117,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A,

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    /*-----------------------------------------------------------------
@@ -1144,8 +1144,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A,
    }

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    return hypre_error_flag;
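/*
 * [pattern summary -- grounded in the hunks above and in par_csr_matvec.c
 * below] par_relax.c shows the renamed save/disable/restore idiom around hot
 * loops: per-call stream synchronization is switched off, the kernels are
 * queued back to back, and a single synchronization is issued at the end:
 */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_Int sync_stream;
   hypre_GetSyncDeviceCompute(&sync_stream);       /* save the caller's policy       */
   hypre_SetSyncDeviceCompute(0);                  /* queue kernels asynchronously   */
   /* ... launch the relaxation / matvec kernels ... */
   hypre_SetSyncDeviceCompute(sync_stream);        /* restore the caller's policy    */
   hypre_SyncDeviceComputeStream(hypre_handle());  /* one synchronization at the end */
#endif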
diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c
index 00f6f639cf..3388da1f82 100644
--- a/src/parcsr_ls/par_relax_more_device.c
+++ b/src/parcsr_ls/par_relax_more_device.c
@@ -155,7 +155,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
+   HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
                      gDim,
                      bDim,
                      A_num_rows,
@@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
                      rowsums_upper,
                      scale);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + A_num_rows, (HYPRE_Real)0,
                              thrust::minimum<HYPRE_Real>());
@@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A,     /* matrix to relax
    /* set residual to random */
    hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    HYPRE_THRUST_CALL(transform, r_data, r_data + local_size, r_data,
diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c
index af6d9b0ad2..3f884ee92b 100644
--- a/src/parcsr_ls/par_strength_device.c
+++ b/src/parcsr_ls/par_strength_device.c
@@ -140,7 +140,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix    *A,

    if (abs_soc)
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
                         num_variables, max_row_sum, strength_threshold,
                         A_diag_data, A_diag_i, A_diag_j,
                         A_offd_data, A_offd_i, A_offd_j,
@@ -150,7 +150,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix    *A,
    }
    else
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
                         num_variables, max_row_sum, strength_threshold,
                         A_diag_data, A_diag_i, A_diag_j,
                         A_offd_data, A_offd_i, A_offd_j,
diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c
index 35fef28c8d..9786d21d31 100644
--- a/src/parcsr_mv/par_csr_communication.c
+++ b/src/parcsr_mv/par_csr_communication.c
@@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int            job,
    recv_data = recv_data_in;

    // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI.
    // Need to check MPI documentation whether this is acutally true
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    num_requests = num_sends + num_recvs;
diff --git a/src/parcsr_mv/par_csr_matop.c b/src/parcsr_mv/par_csr_matop.c
index 8eeb6dcf4c..97552f4aa1 100644
--- a/src/parcsr_mv/par_csr_matop.c
+++ b/src/parcsr_mv/par_csr_matop.c
@@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix  *A,
    if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE )
    {
       hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C));
-      hypre_SyncCudaComputeStream(hypre_handle());
+      hypre_SyncDeviceComputeStream(hypre_handle());
    }
 #endif

diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c
index 9387a863f8..992dea4964 100644
--- a/src/parcsr_mv/par_csr_matop_device.c
+++ b/src/parcsr_mv/par_csr_matop_device.c
@@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A)
    hypre_CSRMatrixData(B) = B_a;
    hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE;

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return B;
 }
@@ -628,7 +628,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A)
    const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(A_diag),
                      hypre_CSRMatrixNumCols(A_diag),
@@ -745,7 +745,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(A_diag),
                      hypre_CSRMatrixNumCols(A_diag),
@@ -777,7 +777,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,

    hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag));

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(E_diag),
                      hypre_CSRMatrixNumCols(E_diag),
@@ -1044,7 +1044,7 @@ hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix  *mat,
       *values = hypre_ParCSRMatrixRowvalues(mat);
    }

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return hypre_error_flag;
 }
@@ -1211,21 +1211,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A,

    if (type == -1)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
    }
    if (type == 1)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
    }
    if (type == 2)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
@@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA,
    HYPRE_Int ierr = 0;
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
    hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data);
-   //hypre_SyncCudaComputeStream(hypre_handle());
+   //hypre_SyncDeviceComputeStream(hypre_handle());
 #else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
    HYPRE_Int i;
 #if defined(HYPRE_USING_DEVICE_OPENMP)
diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c
index 30921fe960..d53f74a9d8 100644
--- a/src/parcsr_mv/par_csr_matvec.c
+++ b/src/parcsr_mv/par_csr_matvec.c
@@ -56,8 +56,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #if defined(HYPRE_USING_GPU)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    HYPRE_ANNOTATE_FUNC_BEGIN;
@@ -348,8 +348,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
    }

 #if defined(HYPRE_USING_GPU)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

 #ifdef HYPRE_PROFILE
@@ -415,8 +415,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex    alpha,
 #if defined(HYPRE_USING_GPU)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    HYPRE_ANNOTATE_FUNC_BEGIN;
@@ -724,8 +724,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex    alpha,
    }

 #if defined(HYPRE_USING_GPU)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

 #ifdef HYPRE_PROFILE
diff --git a/src/parcsr_mv/par_csr_triplemat_device.c b/src/parcsr_mv/par_csr_triplemat_device.c
index 0b8a67fd63..5c77572e04 100644
--- a/src/parcsr_mv/par_csr_triplemat_device.c
+++ b/src/parcsr_mv/par_csr_triplemat_device.c
@@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix  *A,

    hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return C;
 }
@@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R,

    hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return C;
 }
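/*
 * [orientation note and host-side sketch -- not part of the patch] The
 * csr_matop_device.c rewrite below hoists hypre_CSRMatrixSplitDevice_core
 * ahead of its callers and adds a SYCL variant. Both variants build the
 * merged offd column map the same way: concatenate B_ext's offd global
 * columns with col_map_offd_B, sort, unique, then lower_bound to rewrite each
 * global column as its position in the deduplicated map. The same idea,
 * serially on the host for clarity (names here are hypothetical):
 */
#include <algorithm>
static void sketch_compress_cols(HYPRE_BigInt *col_map, HYPRE_Int map_len,
                                 const HYPRE_BigInt *bigj, HYPRE_Int nnz,
                                 HYPRE_Int *j_local, HYPRE_Int *num_cols_out)
{
   /* col_map holds the concatenated candidate columns on entry */
   std::sort(col_map, col_map + map_len);
   HYPRE_Int num_cols = (HYPRE_Int)(std::unique(col_map, col_map + map_len) - col_map);
   for (HYPRE_Int k = 0; k < nnz; k++)
   {
      /* index in the sorted, deduplicated map == local column index */
      j_local[k] = (HYPRE_Int)(std::lower_bound(col_map, col_map + num_cols, bigj[k]) - col_map);
   }
   *num_cols_out = num_cols;
}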
diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c
index 5f56789ae4..bacc0b28fe 100644
--- a/src/seq_mv/csr_matop_device.c
+++ b/src/seq_mv/csr_matop_device.c
@@ -110,73 +110,470 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data)

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

-hypre_CSRMatrix*
-hypre_CSRMatrixAddDevice ( HYPRE_Complex    alpha,
-                           hypre_CSRMatrix *A,
-                           HYPRE_Complex    beta,
-                           hypre_CSRMatrix *B )
+HYPRE_Int
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int      job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */
+                                 HYPRE_Int      num_rows,
+                                 HYPRE_Int      B_ext_nnz,
+                                 HYPRE_Int     *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */
+                                 HYPRE_BigInt  *B_ext_bigj, /* Note: [BigInt] global column indices */
+                                 HYPRE_Complex *B_ext_data,
+                                 char          *B_ext_xata, /* companion data with B_ext_data; NULL if none */
+                                 HYPRE_BigInt   first_col_diag_B,
+                                 HYPRE_BigInt   last_col_diag_B,
+                                 HYPRE_Int      num_cols_offd_B,
+                                 HYPRE_BigInt  *col_map_offd_B,
+                                 HYPRE_Int    **map_B_to_C_ptr,
+                                 HYPRE_Int     *num_cols_offd_C_ptr,
+                                 HYPRE_BigInt **col_map_offd_C_ptr,
+                                 HYPRE_Int     *B_ext_diag_nnz_ptr,
+                                 HYPRE_Int     *B_ext_diag_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_diag_j,
+                                 HYPRE_Complex *B_ext_diag_data,
+                                 char          *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */
+                                 HYPRE_Int     *B_ext_offd_nnz_ptr,
+                                 HYPRE_Int     *B_ext_offd_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_offd_j,
+                                 HYPRE_Complex *B_ext_offd_data,
+                                 char          *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ )
 {
-   HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
-   HYPRE_Int        *A_i      = hypre_CSRMatrixI(A);
-   HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
-   HYPRE_Int         nrows_A  = hypre_CSRMatrixNumRows(A);
-   HYPRE_Int         ncols_A  = hypre_CSRMatrixNumCols(A);
-   HYPRE_Int         nnz_A    = hypre_CSRMatrixNumNonzeros(A);
-   HYPRE_Complex    *B_data   = hypre_CSRMatrixData(B);
-   HYPRE_Int        *B_i      = hypre_CSRMatrixI(B);
-   HYPRE_Int        *B_j      = hypre_CSRMatrixJ(B);
-   HYPRE_Int         nrows_B  = hypre_CSRMatrixNumRows(B);
-   HYPRE_Int         ncols_B  = hypre_CSRMatrixNumCols(B);
-   HYPRE_Int         nnz_B    = hypre_CSRMatrixNumNonzeros(B);
-   HYPRE_Complex    *C_data;
-   HYPRE_Int        *C_i;
-   HYPRE_Int        *C_j;
-   HYPRE_Int         nnzC;
-   hypre_CSRMatrix  *C;
-
-   if (nrows_A != nrows_B || ncols_A != ncols_B)
-   {
-      hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! Incompatible matrix dimensions!\n");
+   HYPRE_Int     B_ext_diag_nnz;
+   HYPRE_Int     B_ext_offd_nnz;
+   HYPRE_BigInt *B_ext_diag_bigj = NULL;
+   HYPRE_BigInt *B_ext_offd_bigj = NULL;
+   HYPRE_BigInt *col_map_offd_C;
+   HYPRE_Int    *map_B_to_C = NULL;
+   HYPRE_Int     num_cols_offd_C;
+
+   in_range<HYPRE_BigInt> pred1(first_col_diag_B, last_col_diag_B);
+
+   /* get diag and offd nnz */
+   if (job == 0)
+   {
+      /* query the nnz's */
+      B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if,
+                                          B_ext_bigj,
+                                          B_ext_bigj + B_ext_nnz,
+                                          pred1 );
+      B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz;
+
+      *B_ext_diag_nnz_ptr = B_ext_diag_nnz;
+      *B_ext_offd_nnz_ptr = B_ext_offd_nnz;
+
+      return hypre_error_flag;
+   }
+   else
+   {
+      B_ext_diag_nnz = *B_ext_diag_nnz_ptr;
+      B_ext_offd_nnz = *B_ext_offd_nnz_ptr;
+   }
+
+   /* copy to diag */
+   B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_diag_xata)
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                              /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */
+         pred1 );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+   else
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                  /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */
+         pred1 );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+
+   HYPRE_THRUST_CALL( transform,
+                      B_ext_diag_bigj,
+                      B_ext_diag_bigj + B_ext_diag_nnz,
+                      thrust::make_constant_iterator(first_col_diag_B),
+                      B_ext_diag_j,
+                      thrust::minus<HYPRE_BigInt>());
+
+   hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE);
+
+   /* copy to offd */
+   B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_offd_xata)
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                              /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */
+         thrust::not1(pred1) );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+   else
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                  /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */
+         thrust::not1(pred1) );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+
+   /* offd map of B_ext_offd Union col_map_offd_B */
+   col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   HYPRE_THRUST_CALL( sort,
+                      col_map_offd_C,
+                      col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique,
+                                              col_map_offd_C,
+                                              col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   num_cols_offd_C = new_end - col_map_offd_C;
-
-      return NULL;
-   }
+#if 1
+   HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE);
+   col_map_offd_C = tmp;
+#else
+   col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+#endif
+
+   /* create map from col_map_offd_B */
+   if (num_cols_offd_B)
+   {
+      map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+      HYPRE_THRUST_CALL( lower_bound,
+                         col_map_offd_C,
+                         col_map_offd_C + num_cols_offd_C,
+                         col_map_offd_B,
+                         col_map_offd_B + num_cols_offd_B,
+                         map_B_to_C );
+   }
+
+   HYPRE_THRUST_CALL( lower_bound,
+                      col_map_offd_C,
+                      col_map_offd_C + num_cols_offd_C,
+                      B_ext_offd_bigj,
+                      B_ext_offd_bigj + B_ext_offd_nnz,
+                      B_ext_offd_j );
+
+   hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE);
+
+   if (map_B_to_C_ptr)
+   {
+      *map_B_to_C_ptr = map_B_to_C;
+   }
+   *num_cols_offd_C_ptr = num_cols_offd_C;
+   *col_map_offd_C_ptr  = col_map_offd_C;
+
+   return hypre_error_flag;
+}
+
+typedef thrust::tuple<HYPRE_Int, HYPRE_Int> Int2;
+struct Int2Unequal : public thrust::unary_function<Int2, bool>
+{
+   __host__ __device__
+   bool operator()(const Int2& t) const
+   {
+      return (thrust::get<0>(t) != thrust::get<1>(t));
+   }
+};
+
+/* this predicate compares first and second element in a tuple in absolute value */
+/* first is assumed to be complex, second to be real > 0 */
+struct cabsfirst_greaterthan_second_pred : public thrust::unary_function<thrust::tuple<HYPRE_Complex, HYPRE_Real>, bool>
+{
+   __host__ __device__
+   bool operator()(const thrust::tuple<HYPRE_Complex, HYPRE_Real>& t) const
+   {
+      const HYPRE_Complex i = thrust::get<0>(t);
+      const HYPRE_Real    j = thrust::get<1>(t);
+
+      return hypre_cabs(i) > j;
+   }
+};
+
+#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */
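/*
 * [correspondence note -- grounded in the two implementations] The SYCL
 * version that follows mirrors the Thrust code above nearly one-to-one:
 *
 *    HYPRE_THRUST_CALL(count_if, ...)           ->  HYPRE_ONEDPL_CALL(std::count_if, ...)
 *    HYPRE_THRUST_CALL(copy_if, ...)  (stencil) ->  HYPRE_ONEDPL_CALL(dpct::copy_if, ...)
 *    thrust::make_zip_iterator(make_tuple(...)) ->  oneapi::dpl::make_zip_iterator(...)
 *    thrust::not1(pred)                         ->  std::not_fn(pred)
 *    sort / unique / lower_bound                ->  std::sort / std::unique / oneapi::dpl::lower_bound
 *
 * One real divergence: no constant iterator is used on the oneDPL path here,
 * so the SYCL code fills a temporary device array with first_col_diag_B and
 * feeds it to std::transform. The shared stencil copy_if shape, for reference
 * (illustrative fragment in the Thrust spelling; ii/bigj/data/out_zip/pred are
 * placeholders):
 */
// auto first = thrust::make_zip_iterator(thrust::make_tuple(ii, bigj, data));
// thrust::copy_if(first, first + nnz,  /* input tuples                      */
//                 bigj,                /* stencil: global column indices    */
//                 out_zip,             /* gathered output tuples            */
//                 pred);               /* in_range, or its negation for offd */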
+#if defined(HYPRE_USING_SYCL)
+
+HYPRE_Int
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int      job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */
+                                 HYPRE_Int      num_rows,
+                                 HYPRE_Int      B_ext_nnz,
+                                 HYPRE_Int     *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */
+                                 HYPRE_BigInt  *B_ext_bigj, /* Note: [BigInt] global column indices */
+                                 HYPRE_Complex *B_ext_data,
+                                 char          *B_ext_xata, /* companion data with B_ext_data; NULL if none */
+                                 HYPRE_BigInt   first_col_diag_B,
+                                 HYPRE_BigInt   last_col_diag_B,
+                                 HYPRE_Int      num_cols_offd_B,
+                                 HYPRE_BigInt  *col_map_offd_B,
+                                 HYPRE_Int    **map_B_to_C_ptr,
+                                 HYPRE_Int     *num_cols_offd_C_ptr,
+                                 HYPRE_BigInt **col_map_offd_C_ptr,
+                                 HYPRE_Int     *B_ext_diag_nnz_ptr,
+                                 HYPRE_Int     *B_ext_diag_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_diag_j,
+                                 HYPRE_Complex *B_ext_diag_data,
+                                 char          *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */
+                                 HYPRE_Int     *B_ext_offd_nnz_ptr,
+                                 HYPRE_Int     *B_ext_offd_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_offd_j,
+                                 HYPRE_Complex *B_ext_offd_data,
+                                 char          *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ )
+{
+   HYPRE_Int     B_ext_diag_nnz;
+   HYPRE_Int     B_ext_offd_nnz;
+   HYPRE_BigInt *B_ext_diag_bigj = NULL;
+   HYPRE_BigInt *B_ext_offd_bigj = NULL;
+   HYPRE_BigInt *col_map_offd_C;
+   HYPRE_Int    *map_B_to_C = NULL;
+   HYPRE_Int     num_cols_offd_C;
+
+   in_range<HYPRE_BigInt> pred1(first_col_diag_B, last_col_diag_B);
+
+   /* get diag and offd nnz */
+   if (job == 0) {
+      /* query the nnz's */
+      B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if,
+                                          B_ext_bigj,
+                                          B_ext_bigj + B_ext_nnz,
+                                          pred1 );
+      B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz;
+
+      *B_ext_diag_nnz_ptr = B_ext_diag_nnz;
+      *B_ext_offd_nnz_ptr = B_ext_offd_nnz;
+
+      return hypre_error_flag;
+   }
+   else {
+      B_ext_diag_nnz = *B_ext_diag_nnz_ptr;
+      B_ext_offd_nnz = *B_ext_offd_nnz_ptr;
+   }
+
+   /* copy to diag */
+   B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_diag_xata) {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */
+                                        pred1 );
+
+      //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+   else {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */
+                                        pred1 );
+
+      //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+
+   HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+   hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait();
+   HYPRE_ONEDPL_CALL( std::transform,
+                      B_ext_diag_bigj,
+                      B_ext_diag_bigj + B_ext_diag_nnz,
+                      const_iterator, //dpct::make_constant_iterator(first_col_diag_B),
+                      B_ext_diag_j,
+                      std::minus<HYPRE_BigInt>() );
+   hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE);
+
+   hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE);
+
+   /* copy to offd */
+   B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_offd_xata) {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */
+                                        std::not_fn(pred1) );
+
+      // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+   else {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */
+                                        std::not_fn(pred1) );
+
+      // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+
+   /* offd map of B_ext_offd Union col_map_offd_B */
+   col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   HYPRE_ONEDPL_CALL( std::sort,
+                      col_map_offd_C,
+                      col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique,
+                                              col_map_offd_C,
+                                              col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   num_cols_offd_C = new_end - col_map_offd_C;
+
+#if 1
+   HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE,
+                 HYPRE_MEMORY_DEVICE);
+   hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE);
+   col_map_offd_C = tmp;
+#else
+   col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B,
+                                      HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+#endif
+
+   /* create map from col_map_offd_B */
+   if (num_cols_offd_B) {
+      map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+      HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                         col_map_offd_C,
+                         col_map_offd_C + num_cols_offd_C,
+                         col_map_offd_B,
+                         col_map_offd_B + num_cols_offd_B,
+                         map_B_to_C );
+   }
+
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      col_map_offd_C,
+                      col_map_offd_C + num_cols_offd_C,
+                      B_ext_offd_bigj,
+                      B_ext_offd_bigj + B_ext_offd_nnz,
+                      B_ext_offd_j );
+
+   hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE);
+
+   if (map_B_to_C_ptr) {
+      *map_B_to_C_ptr = map_B_to_C;
+   }
+   *num_cols_offd_C_ptr = num_cols_offd_C;
+   *col_map_offd_C_ptr  = col_map_offd_C;
+
+   return hypre_error_flag;
+}
+
+/* this predicate compares first and second element in a tuple in absolute value */
+/* first is assumed to be complex, second to be real > 0 */
+struct cabsfirst_greaterthan_second_pred
+{
+   bool operator()(const std::tuple<HYPRE_Complex, HYPRE_Real>& t) const
+   {
+      const HYPRE_Complex i = std::get<0>(t);
+      const HYPRE_Real    j = std::get<1>(t);
+
+      return hypre_cabs(i) > j;
+   }
+};
-
-   hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B,
-                        A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL,
-                        &nnzC, &C_i, &C_j, &C_data);
+#endif /* HYPRE_USING_SYCL */
-   C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC);
-   hypre_CSRMatrixI(C) = C_i;
-   hypre_CSRMatrixJ(C) = C_j;
-   hypre_CSRMatrixData(C) = C_data;
-   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
-   hypre_SyncCudaComputeStream(hypre_handle());
+#if defined(HYPRE_USING_GPU)
-   return C;
+hypre_CSRMatrix*
+hypre_CSRMatrixAddDevice ( HYPRE_Complex    alpha,
+                           hypre_CSRMatrix *A,
+                           HYPRE_Complex    beta,
+                           hypre_CSRMatrix *B )
+{
+   HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
+   HYPRE_Int        *A_i      = hypre_CSRMatrixI(A);
+   HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
+   HYPRE_Int         nrows_A  = hypre_CSRMatrixNumRows(A);
+   HYPRE_Int         ncols_A  = hypre_CSRMatrixNumCols(A);
+   HYPRE_Int         nnz_A    = hypre_CSRMatrixNumNonzeros(A);
+   HYPRE_Complex    *B_data   = hypre_CSRMatrixData(B);
+   HYPRE_Int        *B_i      = hypre_CSRMatrixI(B);
+   HYPRE_Int        *B_j      = hypre_CSRMatrixJ(B);
+   HYPRE_Int         nrows_B  = hypre_CSRMatrixNumRows(B);
+   HYPRE_Int         ncols_B  = hypre_CSRMatrixNumCols(B);
+   HYPRE_Int         nnz_B    = hypre_CSRMatrixNumNonzeros(B);
+   HYPRE_Complex    *C_data;
+   HYPRE_Int        *C_i;
+   HYPRE_Int        *C_j;
+   HYPRE_Int         nnzC;
+   hypre_CSRMatrix  *C;
+
+   if (nrows_A != nrows_B || ncols_A != ncols_B)
+   {
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n");
+
+      return NULL;
+   }
+
+   hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B,
+                        A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL,
+                        &nnzC, &C_i, &C_j, &C_data);
+
+   C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC);
+   hypre_CSRMatrixI(C) = C_i;
+   hypre_CSRMatrixJ(C) = C_j;
+   hypre_CSRMatrixData(C) = C_data;
+   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
+
+   hypre_SyncDeviceComputeStream(hypre_handle());
+
+   return C;
 }

 hypre_CSRMatrix*
 hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A,
                                hypre_CSRMatrix *B)
 {
-   HYPRE_Int        ncols_A = hypre_CSRMatrixNumCols(A);
-   HYPRE_Int        nrows_B = hypre_CSRMatrixNumRows(B);
-   hypre_CSRMatrix *C;
+   HYPRE_Int        ncols_A = hypre_CSRMatrixNumCols(A);
+   HYPRE_Int        nrows_B = hypre_CSRMatrixNumRows(B);
+   hypre_CSRMatrix *C;

-   if (ncols_A != nrows_B)
-   {
-      hypre_printf("Warning! incompatible matrix dimensions!\n");
-      hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! incompatible matrix dimensions!\n");
+   if (ncols_A != nrows_B)
+   {
+      hypre_printf("Warning! incompatible matrix dimensions!\n");
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n");

-      return NULL;
-   }
+      return NULL;
+   }

-   hypreDevice_CSRSpGemm(A, B, &C);
+   hypreDevice_CSRSpGemm(A, B, &C);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

-   return C;
+   return C;
 }
@@ -184,12 +581,12 @@ hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A,
                                       hypre_CSRMatrix *B,
                                       hypre_CSRMatrix *C )
 {
-   hypre_CSRMatrix *BC  = hypre_CSRMatrixMultiplyDevice(B, C);
-   hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC);
+   hypre_CSRMatrix *BC  = hypre_CSRMatrixMultiplyDevice(B, C);
+   hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC);

-   hypre_CSRMatrixDestroy(BC);
+   hypre_CSRMatrixDestroy(BC);

-   return ABC;
+   return ABC;
 }
@@ -200,14 +597,13 @@ hypre_CSRMatrixTriLowerUpperSolveDevice(char             uplo,
                                         hypre_Vector    *u )
 {
 #if defined(HYPRE_USING_CUSPARSE)
-   hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u);
+   hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u);
 #elif defined(HYPRE_USING_ROCSPARSE)
-   hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u);
+   hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u);
 #else
-   hypre_error_w_msg(HYPRE_ERROR_GENERIC,
-                     "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n");
+   hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n");
 #endif
-   return hypre_error_flag;
+   return hypre_error_flag;
 }

 /* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part
@@ -229,301 +625,105 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix  *B_ext,
                             hypre_CSRMatrix **B_ext_diag_ptr,
                             hypre_CSRMatrix **B_ext_offd_ptr )
 {
-   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext);
-   HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext);
-
-   HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE);
-   hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii);
-
-   HYPRE_Int B_ext_diag_nnz;
-   HYPRE_Int B_ext_offd_nnz;
-   HYPRE_Int ierr;
-
-   ierr = hypre_CSRMatrixSplitDevice_core( 0,
-                                           num_rows,
-                                           B_ext_nnz,
-                                           NULL,
-                                           hypre_CSRMatrixBigJ(B_ext),
-                                           NULL,
-                                           NULL,
-                                           first_col_diag_B,
-                                           last_col_diag_B,
-                                           num_cols_offd_B,
-                                           NULL,
-                                           NULL,
-                                           NULL,
-                                           NULL,
-                                           &B_ext_diag_nnz,
-                                           NULL,
NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, - last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncCudaComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int - job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int - *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) 
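
The core routine is called twice, as the comment on its first parameter says: with job == 0 it only counts how many entries land in the diagonal block [first_col_diag_B, last_col_diag_B] so the caller can size the output arrays, and with job == 1 it performs the real partition-copy into memory allocated outside. A minimal host-side C++ sketch of that two-pass idiom, assuming a predicate that mirrors hypre's in_range functor (helper names here are illustrative, not hypre API):

    #include <algorithm>
    #include <vector>

    /* Pass 0 of the two-pass split: count the entries whose global column
     * falls inside the diagonal block, so exact-size outputs can be
     * allocated before pass 1 does the actual partition-copy. */
    struct InRange                 /* stand-in for hypre's in_range functor */
    {
        long long first, last;     /* long long stands in for HYPRE_BigInt */
        bool operator()(long long j) const { return j >= first && j <= last; }
    };

    long long count_diag_entries(const std::vector<long long> &bigj,
                                 long long first, long long last)
    {
        return std::count_if(bigj.begin(), bigj.end(), InRange{first, last});
    }
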
-{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) - { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else - { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, - B_ext_diag_xata)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, - B_ext_diag_data)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_THRUST_CALL( transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - thrust::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - thrust::minus()); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, - B_ext_offd_xata)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, - B_ext_offd_data)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - 
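
The sequence that follows (Thrust sort/unique in the removed code, std::sort and std::unique via oneDPL in the added code earlier in this hunk) is the standard concatenate-sort-unique union: copy in the offd global column indices, append B's existing offd column map, sort, drop duplicates, and shrink to the surviving length. A host-side C++ sketch of the same idiom, with long long standing in for HYPRE_BigInt (illustrative helper, not hypre API):

    #include <algorithm>
    #include <vector>

    /* Concatenate-sort-unique union: after sort + unique the vector is the
     * ascending, duplicate-free union of the two inputs, and its length is
     * what the device code stores as num_cols_offd_C. */
    std::vector<long long> union_col_maps(const std::vector<long long> &offd_bigj,
                                          const std::vector<long long> &col_map_offd_B)
    {
        std::vector<long long> c(offd_bigj.size() + col_map_offd_B.size());
        std::copy(offd_bigj.begin(), offd_bigj.end(), c.begin());
        std::copy(col_map_offd_B.begin(), col_map_offd_B.end(),
                  c.begin() + offd_bigj.size());
        std::sort(c.begin(), c.end());
        c.erase(std::unique(c.begin(), c.end()), c.end()); /* shrink step */
        return c;
    }

Because the result is sorted and duplicate-free, the subsequent lower_bound calls can translate each global column index into its local position in col_map_offd_C in O(log n) per lookup.
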
hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - num_cols_offd_C = new_end - col_map_offd_C; - -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, - HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif - - /* create map from col_map_offd_B */ - if (num_cols_offd_B) - { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } - - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); - - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - - if (map_B_to_C_ptr) - { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; - - return hypre_error_flag; + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); + HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); + + HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); + + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; + + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); + + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); + + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); + + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, 
B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); + + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); + + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); + + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; + + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; + + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; } /*-------------------------------------------------------------------------- @@ -541,149 +741,168 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrix *B, HYPRE_Int *row_nums) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; - hypre_CSRMatrix *C; - - if (ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, - 1.0, B_data, NULL, row_nums, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncCudaComputeStream(hypre_handle()); - - return C; + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! 
Incompatible matrix dimensions!\n"); + + return NULL; + } + + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, + &nnzC, &C_i, &C_j, &C_data); + + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return C; } HYPRE_Int hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, HYPRE_Real *colnnz) { - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_j_sorted; - HYPRE_Int num_reduced_col_indices; - HYPRE_Int *reduced_col_indices; - HYPRE_Int *reduced_col_nnz; - - A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_j_sorted; + HYPRE_Int num_reduced_col_indices; + HYPRE_Int *reduced_col_indices; + HYPRE_Int *reduced_col_nnz; + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + + A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); + + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); + hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + std::pair new_end = + HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, + values, + reduced_col_indices, + reduced_col_nnz ); + + hypre_TFree(values, HYPRE_MEMORY_UNIFIED); +#else + HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - thrust::pair new_end = - HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, - thrust::make_constant_iterator(1), - reduced_col_indices, - reduced_col_nnz); + thrust::pair new_end = + HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, + thrust::make_constant_iterator(1), + reduced_col_indices, + reduced_col_nnz); +#endif - hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); - num_reduced_col_indices = new_end.first - reduced_col_indices; + num_reduced_col_indices = new_end.first - reduced_col_indices; - hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, - reduced_col_indices, colnnz); + hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); +#else + HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + 
num_reduced_col_indices, + reduced_col_indices, colnnz); +#endif - hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } __global__ void -hypreCUDAKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) +hypreGPUKernel_CSRMoveDiagFirst( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } - } -} +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + if (row >= nrows) + { + return; + } - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + HYPRE_Int p = 0, q = 0; - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return hypre_error_flag; + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + break; + } + } } /* check if diagonal entry is the first one at each row @@ -691,503 +910,729 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) * RL: only check if it's a non-empty row */ __global__ void -hypreCUDAKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) +hypreGPUKernel_CSRCheckDiagFirst( +#ifdef 
HYPRE_USING_SYCL + sycl::nd_item<1>& item, +#endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) { - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1, 1>(); - if (row < nrows) - { - result[row] = (ia[row + 1] > ia[row]) && (ja[ia[row]] != row); - } +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } } -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) { - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return 0; - } +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + + if (row >= nrows) + { + return; + } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - hypre_TFree(result, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - hypre_SyncCudaComputeStream(hypre_handle()); + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } - return ierr; +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } __global__ void -hypreCUDAKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypreGPUKernel_CSRMatrixReplaceDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - 
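/* Lanes 0 and 1 of the warp loaded ia[row] and ia[row + 1] just above;
 * the shuffle above broadcasts lane 1's value into q (one past the row's
 * last entry) and the shuffle below broadcasts lane 0's value into p
 * (the row's first entry), so every lane sees the same [p, q) range and
 * the warp can scan the row cooperatively in warp-size strides. */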
p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } + + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) { - has_diag = true; - break; + d = v; } - } + data[j] = d; + } - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) +template +__global__ void +hypreGPUKernel_CSRRowSum( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) { - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); #else - HYPRE_Int *result = NULL; + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); #endif + if (row_i >= nrows) + { + return; + } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + HYPRE_Int p = 0, q = 0; -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + if (lane < 2) 
+ { + p = read_only_load(ia + row_i + lane); + } - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + HYPRE_Complex row_sum_i = 0.0; + +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return ierr; + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } +#ifdef HYPRE_USING_SYCL + row_sum_i = warp_reduce_sum(row_sum_i, item); +#else + row_sum_i = warp_reduce_sum(row_sum_i); +#endif + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } } +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ __global__ void -hypreCUDAKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypreGPUKernel_CSRExtractDiag( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } + + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + HYPRE_Int has_diag = 0; +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (row >= nrows) - { - return; - } + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && ja[j] == row; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - 
{ - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ +#ifdef HYPRE_USING_SYCL + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); +#endif - if (find_diag) + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; + mark[j] = c1; } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + else { - has_diag = true; - break; + mark[j] = -1; } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } } +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, HYPRE_Complex v, HYPRE_Real tol ) { - HYPRE_Int ierr = 0; + HYPRE_Int ierr = 0; - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); #else - HYPRE_Int *result = NULL; + HYPRE_Int *result = NULL; #endif - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); #if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif 
defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); #endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return ierr; + return ierr; } -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; - HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } + HYPRE_Int ierr = 0; - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - thrust::zip_iterator< thrust::tuple > new_end; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG - thrust::zip_iterator< thrust::tuple > new_end; + hypre_SyncDeviceComputeStream(hypre_handle()); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); + return ierr; +} - hypre_assert( 
thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + +#ifdef HYPRE_USING_SYCL + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); +#endif + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } + else + { + new_data = NULL; +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // TODO: abb fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - 
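/* new_ii holds one COO-style row index per surviving entry; turning it
 * back into CSR row pointers here is the inverse of the
 * hypreDevice_CsrRowPtrsToIndices expansion performed before the copy_if
 * compaction, after which new_ii itself can be freed. */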
hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -__global__ void -hypreCUDAKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row_i >= nrows) - { - return; - } + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - if (lane < 2) - { - p = read_only_load(ia + row_i + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - HYPRE_Complex row_sum_i = 0.0; +#if defined(HYPRE_USING_CUDA) + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } + hypre_TFree(result, HYPRE_MEMORY_DEVICE); - HYPRE_Complex aii = aa[j]; + hypre_SyncDeviceComputeStream(hypre_handle()); - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } - - row_sum_i = warp_reduce_sum(row_sum_i); - - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } + return ierr; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - if (type == 0) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, 
nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncDeviceComputeStream(hypre_handle()); - hypre_SyncCudaComputeStream(hypre_handle()); + return hypre_error_flag; } -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreCUDAKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) +/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J + * Otherwise, mark pattern not in A-B as -1 in markA + * Note the special treatment for diagonal entries of A (marked as -2) */ +HYPRE_Int +hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *markA, + HYPRE_Int diag_opt) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - if (row >= nrows) - { - return; - } + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#if defined(HYPRE_USING_CUDA) + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); + + auto zipped_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj, idx); + HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); +#endif - HYPRE_Int has_diag = 0; + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) 
- { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } - } + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } + return hypre_error_flag; } void @@ -1195,109 +1640,119 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Complex *d, HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) { - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, - HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), - HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), - HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, - hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, - hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, - hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; + 
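
Stacking C = [A; B] only needs B's row pointers shifted by nnz(A), since B's rows begin after all of A's entries in the concatenated column-index and data arrays; that is what the transform over C_i in this hunk computes. A small host-side C++ sketch of the pointer arithmetic (hypothetical helper, not hypre API):

    #include <cstddef>
    #include <vector>

    /* Build the row-pointer array of C = [A; B]: copy A's pointers
     * verbatim, then append B's pointers (skipping B's leading zero)
     * offset by nnz(A). a_i has nrows_A + 1 entries ending in nnz(A). */
    std::vector<int> stack_row_ptrs(const std::vector<int> &a_i,
                                    const std::vector<int> &b_i)
    {
        const int nnz_a = a_i.back();
        std::vector<int> c_i(a_i);              /* rows of A, unchanged */
        for (std::size_t r = 1; r < b_i.size(); ++r)
        {
            c_i.push_back(b_i[r] + nnz_a);      /* rows of B, offset    */
        }
        return c_i;
    }

For example, a_i = {0, 2, 5} and b_i = {0, 1, 3} yield c_i = {0, 2, 5, 6, 8}, matching two rows of A followed by two rows of B in one CSR structure.
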
hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + + HYPRE_ONEDPL_CALL( std::transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + std::plus() ); + + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); +#else + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); +#endif + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; } /* A = alp * I */ hypre_CSRMatrix * hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) { - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); - return A; + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#else + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + 
hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#endif + return A; } -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public - thrust::unary_function, bool> -{ - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); - - return hypre_cabs(i) > j; - } -}; /* drop the entries that are smaller than: * tol if elmt_tols == null, @@ -1307,248 +1762,210 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, HYPRE_Real tol, HYPRE_Real *elmt_tols) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, - A_data, - A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, - cabsfirst_greaterthan_second_pred() ); - } - - if (new_nnz == nnz) - { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - if (!A_ii) - { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - cabsfirst_greaterthan_second_pred() ); - } - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { +#ifdef HYPRE_USING_SYCL + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + A_data, + A_data + nnz, + std::not_fn(less_than(tol)) ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); +#endif + } + else + { +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, + 
+ + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + + if (elmt_tols == NULL) + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + A_data, + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than<HYPRE_Complex>(tol)) ); + } + else + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + cabsfirst_greaterthan_second_pred() ); + } + + // TODO: abb fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple<HYPRE_Int*, HYPRE_Int*, HYPRE_Complex*> > new_end; + + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than<HYPRE_Complex>(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreCUDAKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (i >= n) - { - return; - } - - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = 
read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); - - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J - * Otherwise, mark pattern not in A-B as -1 in markA - * Note the special treatment for diagonal entries of A (marked as -2) */ -HYPRE_Int -hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *markA, - HYPRE_Int diag_opt) +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); - - hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' 
); + } + + hypre_SyncDeviceComputeStream(hypre_handle()); } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - -#if defined(HYPRE_USING_GPU) - HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, HYPRE_Int data) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - hypre_CSRMatrix *C; - - - /* trivial case */ - if (nnz_A == 0) - { - C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); - C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); - C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); - } - else - { + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + hypre_CSRMatrix *C; + + + /* trivial case */ + if (nnz_A == 0) + { + C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); + C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); + C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); + } + else + { #if defined(HYPRE_USING_CUSPARSE) - hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ROCSPARSE) - hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ONEMKLSPARSE) - hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #else - hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); + hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif - } + } - C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - *AT_ptr = C; + *AT_ptr = C; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } #endif /* #if defined(HYPRE_USING_GPU) */ diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c index 275625ec9f..98a26a942e 100644 --- a/src/seq_mv/csr_matrix.c +++ b/src/seq_mv/csr_matrix.c @@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows, /* set defaults */ hypre_CSRMatrixOwnsData(matrix) = 1; -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) 
hypre_CSRMatrixSortedJ(matrix) = NULL; hypre_CSRMatrixSortedData(matrix) = NULL; hypre_CSRMatrixCsrsvData(matrix) = NULL; diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 811040a510..8b61018ccd 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -117,7 +117,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans, hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -201,7 +201,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, #endif dBuffer) ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); if (trans) { diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index 7d44c2cd05..b4074dadb9 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("Rownnz time %f\n", t2); #endif @@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, 1 /* exact row nnz */, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, d_rc + 2 * m, thrust::identity() ); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzBound time %f\n", t2); #endif @@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, rownnz_exact, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif diff --git a/src/seq_mv/csr_spgemm_device_attempt.c b/src/seq_mv/csr_spgemm_device_attempt.c index 4e61662bdf..7a3fb9e4c6 100644 --- 
a/src/seq_mv/csr_spgemm_device_attempt.c +++ b/src/seq_mv/csr_spgemm_device_attempt.c @@ -506,7 +506,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, d_rc, d_rf ); @@ -542,7 +542,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j, d_ghash2_a, @@ -563,7 +563,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, m, d_rf, d_js, d_as, diff --git a/src/seq_mv/csr_spgemm_device_confident.c b/src/seq_mv/csr_spgemm_device_confident.c index 452acd52fb..871f27f67d 100644 --- a/src/seq_mv/csr_spgemm_device_confident.c +++ b/src/seq_mv/csr_spgemm_device_confident.c @@ -467,7 +467,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz, + HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz, hash_type > ), gDim, bDim, /* shmem_size, */ m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc, @@ -493,7 +493,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, /* copy to the final C */ dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new ); hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE); diff --git a/src/seq_mv/csr_spgemm_device_rowbound.c b/src/seq_mv/csr_spgemm_device_rowbound.c index 094b5a82e2..9697eb83f4 100644 --- a/src/seq_mv/csr_spgemm_device_rowbound.c +++ b/src/seq_mv/csr_spgemm_device_rowbound.c @@ -313,19 +313,19 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m, * ---------------------------------------------------------------------------*/ if (hash_type == 'L') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'Q') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'D') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } diff --git a/src/seq_mv/csr_spgemm_device_rowest.c b/src/seq_mv/csr_spgemm_device_rowest.c index f8f65c216f..50f76b081e 100644 --- a/src/seq_mv/csr_spgemm_device_rowest.c +++ b/src/seq_mv/csr_spgemm_device_rowest.c @@ -287,11 +287,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i dim3 gDim( (nsamples * N + 
bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) ); - HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); + HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); /* step-1: layer 3-2 */ gDim.x = (K + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0); @@ -301,7 +301,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i d_V3 = (T*) d_rc; gDim.x = (M + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor); @@ -336,13 +336,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, if (row_est_mtd == 1) { /* naive overestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, NULL, d_rc ); } else if (row_est_mtd == 2) { /* naive underestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL ); } else if (row_est_mtd == 3) @@ -361,7 +361,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_low = d_low_upp; HYPRE_Int *d_upp = d_low_upp + m; - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp ); /* Cohen's algorithm, stochastic approach */ diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index fac7e8e5ef..8153d82819 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/seq_mv/csr_spmv_device.c b/src/seq_mv/csr_spmv_device.c index d5d62d932a..1ae93fc279 100644 --- a/src/seq_mv/csr_spmv_device.c +++ b/src/seq_mv/csr_spmv_device.c @@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 32; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 32) @@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 16; 
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 16) @@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 8; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 8) @@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else @@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index 548665ed2e..bd85778a03 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -137,7 +137,7 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR *d_ac_out = csc_a; #ifdef HYPRE_PROFILE - hypre_SyncCudaDevice(hypre_handle()) + hypre_SyncDevice(hypre_handle()) hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime(); #endif diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index 9081b58c20..4d7b494ad9 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -281,6 +281,8 @@ HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index de34685237..485d045d08 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -553,6 +553,8 @@ HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, 
hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index 8b024f39c5..bfab868fbb 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -300,7 +300,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -488,7 +488,7 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -542,7 +542,7 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -596,7 +596,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -647,7 +647,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -704,7 +704,7 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -806,7 +806,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); diff --git a/src/sstruct_mv/sstruct_matrix.c b/src/sstruct_mv/sstruct_matrix.c index 1d9ce85366..e51066abcc 100644 --- a/src/sstruct_mv/sstruct_matrix.c +++ b/src/sstruct_mv/sstruct_matrix.c @@ -392,7 +392,7 @@ hypre_SStructPMatrixSetBoxValues( hypre_SStructPMatrix *pmatrix, values, action, -1, 0); /* TODO: Why need DeviceSync? */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/sstruct_mv/sstruct_vector.c b/src/sstruct_mv/sstruct_vector.c index fdeeae6421..fa8db02a35 100644 --- a/src/sstruct_mv/sstruct_vector.c +++ b/src/sstruct_mv/sstruct_vector.c @@ -247,7 +247,7 @@ hypre_SStructPVectorSetBoxValues( hypre_SStructPVector *pvector, /* TODO: Why need DeviceSync? 
*/ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index a3845755ef..c30e3398ae 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -800,7 +800,7 @@ extern "C++" const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -861,7 +861,7 @@ extern "C++" hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index ef36562ef5..d453864f8b 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -74,7 +74,7 @@ extern "C++" const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -135,7 +135,7 @@ extern "C++" hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/test/ij.c b/src/test/ij.c index 26640554c7..a3dcfc76b3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -3406,7 +3406,7 @@ main( hypre_int argc, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3766,7 +3766,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3804,7 +3804,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3865,7 +3865,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; @@ -3897,7 +3897,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index bb17d32803..fb28c9ba55 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -678,7 +678,7 @@ test_Set(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -707,7 +707,7 @@ test_Set(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -831,7 +831,7 @@ 
test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif time_index = hypre_InitializeTiming("Test SetValues OffProc"); @@ -862,7 +862,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, //cudaProfilerStop(); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -945,7 +945,7 @@ test_SetSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -996,7 +996,7 @@ test_SetSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1072,7 +1072,7 @@ test_AddSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1120,7 +1120,7 @@ test_AddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1178,7 +1178,7 @@ test_SetAddSet(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1244,7 +1244,7 @@ test_SetAddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif diff --git a/src/test/ij_mm.c b/src/test/ij_mm.c index 4bbf24fc39..807e9b1630 100644 --- a/src/test/ij_mm.c +++ b/src/test/ij_mm.c @@ -161,7 +161,7 @@ void runjob1( HYPRE_ParCSRMatrix parcsr_A, if (i == rep - 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, A*A", hypre_MPI_COMM_WORLD); @@ -350,7 +350,7 @@ void runjob2( HYPRE_ParCSRMatrix parcsr_A, if (i == 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, RAP2", hypre_MPI_COMM_WORLD); @@ -452,7 +452,7 @@ main( hypre_int argc, HYPRE_Init(); /* for timing, sync after kernels */ - hypre_SetSyncCudaCompute(1); + hypre_SetSyncDeviceCompute(1); #if defined(HYPRE_USING_CUDA) hypre_HandleDefaultExecPolicy(hypre_handle()) = HYPRE_EXEC_DEVICE; diff --git a/src/test/zboxloop.c b/src/test/zboxloop.c index f836aba02e..592ab6a158 100644 --- a/src/test/zboxloop.c +++ b/src/test/zboxloop.c @@ -20,8 +20,6 @@ * Test driver to time new boxloops and compare to the old ones *--------------------------------------------------------------------------*/ -#define DEVICE_VAR - hypre_int main( hypre_int argc, char *argv[] ) @@ -39,6 +37,7 @@ main( hypre_int argc, //HYPRE_Int xi1, xi2, xi3, xi4; HYPRE_Int xi1; HYPRE_Real *xp1, *xp2, *xp3, *xp4; + HYPRE_Real *d_xp1, *d_xp2, *d_xp3, *d_xp4; hypre_Index loop_size, start, unit_stride, index; /*----------------------------------------------------------- @@ -51,6 +50,8 @@ main( hypre_int argc, 
hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + HYPRE_Init(); + /*----------------------------------------------------------- * Set defaults *-----------------------------------------------------------*/ @@ -65,6 +66,8 @@ main( hypre_int argc, Q = 1; R = 1; + reps = -1; + /*----------------------------------------------------------- * Parse command line *-----------------------------------------------------------*/ @@ -92,6 +95,11 @@ main( hypre_int argc, arg_index++; dim = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-reps") == 0 ) + { + arg_index++; + reps = atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-help") == 0 ) { print_usage = 1; @@ -230,7 +238,7 @@ main( hypre_int argc, hypre_MPI_Barrier(hypre_MPI_COMM_WORLD); /*----------------------------------------------------------- - * Time old boxloops + * Time old boxloops [Device] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -239,12 +247,14 @@ main( hypre_int argc, for (rep = 0; rep < reps; rep++) { xi1 = 0; +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop0Begin(3, loop_size); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; //xi1++; } hypre_BoxLoop0End(); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -253,12 +263,14 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop1Begin(3, loop_size, x1_data_box, start, unit_stride, xi1); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; } hypre_BoxLoop1End(xi1); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -267,13 +279,15 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2) hypre_BoxLoop2Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); { - xp1[xi1] += xp1[xi1] + xp2[xi2]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2]; } hypre_BoxLoop2End(xi1, xi2); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -282,14 +296,16 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3) hypre_BoxLoop3Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3]; } hypre_BoxLoop3End(xi1, xi2, xi3); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -298,24 +314,26 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3,d_xp4) hypre_BoxLoop4Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3] + xp4[xi4]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3] + d_xp4[xi4]; } hypre_BoxLoop4End(xi1, xi2, xi3, xi4); +#undef DEVICE_VAR } hypre_EndTiming(time_index); - hypre_PrintTiming("Old BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("Old BoxLoop times [DEVICE]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); /*----------------------------------------------------------- - * Time new boxloops + * Time new boxloops [Host] *-----------------------------------------------------------*/ /* 
Time BoxLoop0 */ @@ -415,7 +433,7 @@ main( hypre_int argc, } hypre_EndTiming(time_index); - hypre_PrintTiming("New BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("New BoxLoop times [HOST]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); @@ -427,11 +445,19 @@ main( hypre_int argc, hypre_BoxDestroy(x2_data_box); hypre_BoxDestroy(x3_data_box); hypre_BoxDestroy(x4_data_box); + hypre_TFree(xp1, HYPRE_MEMORY_HOST); hypre_TFree(xp2, HYPRE_MEMORY_HOST); hypre_TFree(xp3, HYPRE_MEMORY_HOST); hypre_TFree(xp4, HYPRE_MEMORY_HOST); + hypre_TFree(d_xp1, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp2, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp3, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp4, HYPRE_MEMORY_DEVICE); + + HYPRE_Finalize(); + /* Finalize MPI */ hypre_MPI_Finalize(); diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 5dc0ff6a10..6ac7ccd255 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -83,7 +83,15 @@ typedef double HYPRE_Real; #endif #if defined(HYPRE_COMPLEX) -typedef double _Complex HYPRE_Complex; + +#if defined(HYPRE_USING_SYCL) + typedef std::complex<double> HYPRE_Complex; +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + typedef thrust::complex<double> HYPRE_Complex; +#else + typedef double _Complex HYPRE_Complex; +#endif + #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX /* or MPI_LONG_DOUBLE ? */ #else /* default */ @@ -177,11 +185,15 @@ HYPRE_Int HYPRE_AssumedPartitionCheck(); * HYPRE memory location *--------------------------------------------------------------------------*/ +// ABB: HYPRE_MEMORY_UNIFIED allocates SHARED (unified) memory at selected +// call sites; building with the HYPRE_USING_UNIFIED_MEMORY macro enables it +// everywhere typedef enum _HYPRE_MemoryLocation { HYPRE_MEMORY_UNDEFINED = -1, HYPRE_MEMORY_HOST, - HYPRE_MEMORY_DEVICE + HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_UNIFIED } HYPRE_MemoryLocation; HYPRE_Int HYPRE_SetMemoryLocation(HYPRE_MemoryLocation memory_location); diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 0df44e6bea..d26bf1927b 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -635,6 +635,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } @@ -1740,8 +1745,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -1772,10 +1777,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); 
+HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 9bdf2d06a7..1a6f392e84 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,6 +110,11 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) +typedef sycl::range<1> dim3; +#define __global__ +#define __host__ +#define __device__ + /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -392,17 +397,39 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -443,15 +470,15 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -467,7 +494,7 @@ using namespace thrust::placeholders; } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) 
HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ @@ -1002,10 +1029,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -1017,22 +1040,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1058,6 +1065,482 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#pragma once + +#include +#include +#include +#include + +#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if + +#include +#include +#include +#include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + +#if defined(HYPRE_DEBUG) +#if 
defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ + (kernel_name)(item, __VA_ARGS__); \ + }); \ + } \ +} + +/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right + * The following one works OK for now */ + +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); + +// /* return the number of threads in block */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_num_threads() +// { +// switch (dim) +// { +// case 1: +// return (blockDim.x); +// case 2: +// return (blockDim.x * blockDim.y); +// case 3: +// return (blockDim.x * blockDim.y * blockDim.z); +// } + +// return -1; +// } + +/* return the number of (sub_groups) warps in (work-group) block */ +template +static __forceinline__ +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the thread lane id in warp */ +template +static __forceinline__ +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_local_linear_id(); +} + +// /* return the number of threads in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_threads() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); +// } + +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. 
SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > +static __forceinline__ +hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) +{ + static_assert(bdim == 1 && gdim == 1); + return item.get_global_id(0); +} + +// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template +static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item& item) +{ + sycl::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, 
__shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value : public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range +{ + T low, up; + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const { return (x >= low && x <= up); } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +#ifdef HYPRE_COMPLEX +template::value>::type> +struct less_than +{ + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); } +}; +#else +template::value>::type> +struct less_than +{ + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (x < val); } +}; +#endif +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + 
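For readers coming from the CUDA side: with the typedef dim3 = sycl::range<1> and the SYCL HYPRE_GPU_LAUNCH defined above, a device kernel becomes an ordinary function whose first parameter is the sycl::nd_item<1> injected by the macro's parallel_for. A sketch of the calling convention (hypreSYCLKernel_axpy and hypre_axpy_device are made-up names; the helpers are the ones declared in this header):

   /* kernel: the first argument is supplied by HYPRE_GPU_LAUNCH's parallel_for */
   void hypreSYCLKernel_axpy(sycl::nd_item<1> item, HYPRE_Int n, HYPRE_Complex a,
                             HYPRE_Complex *x, HYPRE_Complex *y)
   {
      HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item);
      if (i < n)
      {
         y[i] += a * x[i];
      }
   }

   /* call site, mirroring the CUDA/HIP launch convention used elsewhere in this patch */
   void hypre_axpy_device(HYPRE_Int n, HYPRE_Complex a, HYPRE_Complex *x, HYPRE_Complex *y)
   {
      dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
      dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
      HYPRE_GPU_LAUNCH( hypreSYCLKernel_axpy, gDim, bDim, n, a, x, y );
   }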
+////////////////////////////////////////////////////////////////////////////////////////
+
 #if defined(HYPRE_USING_CUSPARSE)
 cudaDataType hypre_HYPREComplexToCudaDataType();
@@ -1336,7 +1819,7 @@ struct ReduceSum
       /* 2nd reduction with only *one* block */
       hypre_assert(nblocks >= 0 && nblocks <= 1024);
       const dim3 gDim(1), bDim(1024);
-      HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
+      HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
       hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
       val += init;
    }
diff --git a/src/utilities/complex.c b/src/utilities/complex.c
index eb8dca4f38..59b71bbf56 100644
--- a/src/utilities/complex.c
+++ b/src/utilities/complex.c
@@ -9,30 +9,52 @@
 #ifdef HYPRE_COMPLEX
-#include <complex.h>
-
 HYPRE_Complex
 hypre_conj( HYPRE_Complex value )
 {
-   return conj(value);
+#ifdef HYPRE_USING_SYCL
+   return std::conj(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::conj(value);
+#else
+   return conj(value);
+#endif
 }
 
 HYPRE_Real
 hypre_cabs( HYPRE_Complex value )
 {
-   return cabs(value);
+#ifdef HYPRE_USING_SYCL
+   return std::abs(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::abs(value);
+#else
+   return cabs(value);
+#endif
 }
 
 HYPRE_Real
 hypre_creal( HYPRE_Complex value )
 {
-   return creal(value);
+#ifdef HYPRE_USING_SYCL
+   return std::real(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::real(value);
+#else
+   return creal(value);
+#endif
 }
 
 HYPRE_Real
 hypre_cimag( HYPRE_Complex value )
 {
-   return cimag(value);
+#ifdef HYPRE_USING_SYCL
+   return std::imag(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::imag(value);
+#else
+   return cimag(value);
+#endif
 }
 
-#endif
+#endif // HYPRE_COMPLEX
diff --git a/src/utilities/device_reducer.h b/src/utilities/device_reducer.h
index ed8604e92b..e62d90d213 100644
--- a/src/utilities/device_reducer.h
+++ b/src/utilities/device_reducer.h
@@ -267,7 +267,7 @@ struct ReduceSum
       /* 2nd reduction with only *one* block */
       hypre_assert(nblocks >= 0 && nblocks <= 1024);
       const dim3 gDim(1), bDim(1024);
-      HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
+      HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
       hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
       val += init;
    }
diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c
index f00a23415f..a75f9be6d2 100644
--- a/src/utilities/device_utils.c
+++ b/src/utilities/device_utils.c
@@ -8,16 +8,185 @@
 #include "_hypre_utilities.h"
 #include "_hypre_utilities.hpp"
 
+// some common kernels for CUDA, HIP and SYCL
+#ifdef HYPRE_USING_GPU
+
+/**
+ * Get NNZ of each row in d_row_indices and store the results in d_rownnz
+ * All pointers are device pointers.
+ * d_rownnz can be the same as d_row_indices
+ */
+__global__ void
+hypreGPUKernel_GetRowNnz(
+   #ifdef HYPRE_USING_SYCL
+   sycl::nd_item<1> item,
+   #endif
+   HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia,
+   HYPRE_Int *d_offd_ia,
+   HYPRE_Int *d_rownnz)
+{
+#if defined(HYPRE_USING_SYCL)
+   const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item);
+#else
+   const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>();
+#endif
+
+   if (global_thread_id < nrows)
+   {
+      HYPRE_Int i;
+
+      if (d_row_indices)
+      {
+         i = read_only_load(&d_row_indices[global_thread_id]);
+      }
+      else
+      {
+         i = global_thread_id;
+      }
+
+      d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) +
+                                   read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]);
+   }
+}
+
+HYPRE_Int*
+hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind)
+{
+   HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE);
+
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(0),
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#else
+   HYPRE_THRUST_CALL( lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      thrust::counting_iterator<HYPRE_Int>(0),
+                      thrust::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#endif
+
+   return d_row_ptr;
+}
+
+HYPRE_Int
+hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
+                                   HYPRE_Int *d_row_ptr)
+{
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(0),
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#else
+   HYPRE_THRUST_CALL( lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      thrust::counting_iterator<HYPRE_Int>(0),
+                      thrust::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#endif
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int*
+hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr)
+{
+   /* trivial case */
+   if (nrows <= 0 || nnz <= 0)
+   {
+      return NULL;
+   }
+
+   HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
+
+   hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind);
+
+   return d_row_ind;
+}
+
+HYPRE_Int
+hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i)
+{
+#ifdef HYPRE_USING_SYCL
+   return HYPRE_ONEDPL_CALL(oneapi::dpl::reduce, d_i, d_i + n);
+#else
+   return HYPRE_THRUST_CALL(reduce, d_i, d_i + n);
+#endif
+}
+
+HYPRE_Int
+hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
+{
 #if defined(HYPRE_USING_SYCL)
-sycl::range<1> hypre_GetDefaultDeviceBlockDimension()
+   HYPRE_ONEDPL_CALL(oneapi::dpl::inclusive_scan, d_i, d_i + n, d_i);
+#else
+   HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i);
+#endif
+   return hypre_error_flag;
+}
+
+HYPRE_Int
+hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
+{
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL(std::exclusive_scan, d_i, d_i + n, d_i, 0, std::plus<>());
+#else
+   HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i);
+#endif
+   return hypre_error_flag;
+}
+
+/* Input: d_row_num, of size nrows, contains the row indices that can be BigInt or Int
+ * Output: d_row_ind */
+template <typename T>
+HYPRE_Int
+hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
+                                          T *d_row_num, T *d_row_ind)
+{
+   /* trivial case */
+   if (nrows <= 0)
+   {
+      return hypre_error_flag;
+   }
+
+   HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz,
HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); + +#ifdef HYPRE_USING_SYCL + hypreSycl_gather(map, map + nnz, d_row_num, d_row_ind); +#else + HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); +#endif + + hypre_TFree(map, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); +#if defined(HYPRE_MIXEDINT) +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); +#endif + +#endif // HYPRE_USING_GPU + +#if defined(HYPRE_USING_SYCL) +dim3 hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, - const char *granularity, - sycl::range<1> wgDim) +dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) { HYPRE_Int num_WGs = 0; HYPRE_Int num_workitems_per_WG = wgDim[0]; @@ -42,7 +211,45 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } -#endif + +struct hypre_empty_row_functor +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Int a = std::get<0>(t); + const HYPRE_Int b = std::get<1>(t); + return a != b; + } +}; + +HYPRE_Int +hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return hypre_error_flag; + } + + HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); + + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + hypre_empty_row_functor() ), + d_row_ind, + oneapi::dpl::identity() ); + + HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + sycl::maximum() ); + + return hypre_error_flag; +} + +#endif // HYPRE_USING_SYCL #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -84,7 +291,7 @@ void hypre_CudaCompileFlagCheck() HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); @@ -150,36 +357,6 @@ hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, return gDim; } -/** - * Get NNZ of each row in d_row_indices and stored the results in d_rownnz - * All pointers are device pointers. 
- * d_rownnz can be the same as d_row_indices - */ -__global__ void -hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, - HYPRE_Int *d_offd_ia, - HYPRE_Int *d_rownnz) -{ - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (global_thread_id < nrows) - { - HYPRE_Int i; - - if (d_row_indices) - { - i = read_only_load(&d_row_indices[global_thread_id]); - } - else - { - i = global_thread_id; - } - - d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + - read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); - } -} - /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, @@ -195,7 +372,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, + HYPRE_GPU_LAUNCH( hypreGPUKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; @@ -335,7 +512,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, } */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A, d_diag_i, d_diag_j, d_diag_a, d_offd_i, d_offd_j, d_offd_a, @@ -344,28 +521,6 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, return hypre_error_flag; } -HYPRE_Int -hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) -{ - return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); -} - -HYPRE_Int -hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - -HYPRE_Int -hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v) { @@ -405,22 +560,6 @@ struct hypre_empty_row_functor } }; -HYPRE_Int* -hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) -{ - /* trivial case */ - if (nrows <= 0 || nnz <= 0) - { - return NULL; - } - - HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); - - return d_row_ind; -} - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -448,64 +587,6 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ return hypre_error_flag; } -/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int - * Output: d_row_ind */ -template -HYPRE_Int -hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - T *d_row_num, T *d_row_ind) -{ - /* trivial case */ - if (nrows <= 0) - { - return hypre_error_flag; - } - - HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); - - HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); - - hypre_TFree(map, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_Int 
*d_row_num, HYPRE_Int *d_row_ind); -#if defined(HYPRE_MIXEDINT) -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); -#endif - -HYPRE_Int* -hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) -{ - HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return d_row_ptr; -} - -HYPRE_Int -hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr) -{ - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return hypre_error_flag; -} - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { @@ -546,7 +627,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea /* trivial cases, n = 1, 2 */ dim3 bDim = 1; dim3 gDim = 1; - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); } else { @@ -585,7 +666,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); if (!work) @@ -628,7 +709,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); return hypre_error_flag; } @@ -662,7 +743,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); return hypre_error_flag; } @@ -696,7 +777,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); return hypre_error_flag; } @@ -735,7 +816,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); return hypre_error_flag; } @@ -770,7 +851,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = 
hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); return hypre_error_flag; } @@ -794,7 +875,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); return hypre_error_flag; } @@ -1373,7 +1454,7 @@ hypre_DeviceDataDestroy(hypre_DeviceData *data) } HYPRE_Int -hypre_SyncCudaDevice(hypre_Handle *hypre_handle) +hypre_SyncDevice(hypre_Handle *hypre_handle) { #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); @@ -1381,6 +1462,8 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipDeviceSynchronize() ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() ); #endif return hypre_error_flag; } @@ -1400,55 +1483,57 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle) * action: 0: set sync stream to false * 1: set sync stream to true * 2: restore sync stream to default - * 3: return the current value of cuda_compute_stream_sync - * 4: sync stream based on cuda_compute_stream_sync + * 3: return the current value of device_compute_stream_sync + * 4: sync stream based on device_compute_stream_sync */ HYPRE_Int -hypre_SyncCudaComputeStream_core(HYPRE_Int action, - hypre_Handle *hypre_handle, - HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_SyncDeviceComputeStream_core(HYPRE_Int action, + hypre_Handle *hypre_handle, + HYPRE_Int *device_compute_stream_sync_ptr) { /* with UVM the default is to sync at kernel completions, since host is also able to * touch GPU memory */ #if defined(HYPRE_USING_UNIFIED_MEMORY) - static const HYPRE_Int cuda_compute_stream_sync_default = 1; + static const HYPRE_Int device_compute_stream_sync_default = 1; #else - static const HYPRE_Int cuda_compute_stream_sync_default = 0; + static const HYPRE_Int device_compute_stream_sync_default = 0; #endif /* this controls if synchronize the stream after computations */ - static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default; + static HYPRE_Int device_compute_stream_sync = device_compute_stream_sync_default; switch (action) { case 0: - cuda_compute_stream_sync = 0; + device_compute_stream_sync = 0; break; case 1: - cuda_compute_stream_sync = 1; + device_compute_stream_sync = 1; break; case 2: - cuda_compute_stream_sync = cuda_compute_stream_sync_default; + device_compute_stream_sync = device_compute_stream_sync_default; break; case 3: - *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync; + *device_compute_stream_sync_ptr = device_compute_stream_sync; break; case 4: #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #else - if (cuda_compute_stream_sync) + if (device_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); +#elif defined(HYPRE_USING_SYCL) 
+         HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->ext_oneapi_submit_barrier() );
 #endif
          }
 #endif
          break;
       default:
-         hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n");
+         hypre_printf("hypre_SyncDeviceComputeStream_core invalid action\n");
         hypre_error_in_arg(1);
    }
 
@@ -1456,35 +1541,35 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action,
 }
 
 HYPRE_Int
-hypre_SetSyncCudaCompute(HYPRE_Int action)
+hypre_SetSyncDeviceCompute(HYPRE_Int action)
 {
    /* convert to 1/0 */
    action = action != 0;
-   hypre_SyncCudaComputeStream_core(action, NULL, NULL);
+   hypre_SyncDeviceComputeStream_core(action, NULL, NULL);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_RestoreSyncCudaCompute()
+hypre_RestoreSyncDeviceCompute()
 {
-   hypre_SyncCudaComputeStream_core(2, NULL, NULL);
+   hypre_SyncDeviceComputeStream_core(2, NULL, NULL);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr)
+hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr)
 {
-   hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
+   hypre_SyncDeviceComputeStream_core(3, NULL, device_compute_stream_sync_ptr);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle)
+hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle)
 {
-   hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL);
+   hypre_SyncDeviceComputeStream_core(4, hypre_handle, NULL);
 
    return hypre_error_flag;
 }
@@ -1550,6 +1635,8 @@ hypre_bind_device( HYPRE_Int myid,
    /* get number of devices on this node */
    hypre_GetDeviceCount(&nDevices);
+   /* TODO: ABB might need to look into this since nDevices is overwritten with 1 */
+   nDevices = 1;
 
    /* set device */
    device_id = myNodeid % nDevices;
@@ -1564,4 +1651,3 @@ hypre_bind_device( HYPRE_Int myid,
 
    return hypre_error_flag;
 }
-
diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h
index c019413d85..45006f9097 100644
--- a/src/utilities/device_utils.h
+++ b/src/utilities/device_utils.h
@@ -53,6 +53,11 @@
 
 #elif defined(HYPRE_USING_SYCL)
 
+typedef sycl::range<1> dim3;
+#define __global__
+#define __host__
+#define __device__
+
 /* WM: problems with this being inside extern C++ {} */
 /* #include */
 
@@ -335,17 +340,39 @@ struct hypre_GpuMatData
 #define hypre_GpuMatDataMatInfo(data)    ((data) -> mat_info)
 #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer)
 
+/* device_utils.c, some common functions for CUDA, SYCL, HIP */
+
+dim3 hypre_GetDefaultDeviceBlockDimension();
+
+dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity,
+                                          dim3 bDim );
+
+HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
+                                             HYPRE_Int *d_row_ind);
+
+HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
+
+HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
+
+HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
+                                             HYPRE_Int *d_row_ptr);
+
+HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i);
+
+HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
+
+HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
+
+template <typename T>
+HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz,
+                                                    HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind);
+
 #endif //#if defined(HYPRE_USING_GPU)
 
 #if defined(HYPRE_USING_SYCL)
 
 /* device_utils.c */
 HYPRE_Int
HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -386,15 +413,15 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -410,7 +437,7 @@ using namespace thrust::placeholders; } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ @@ -945,10 +972,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -960,22 +983,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1001,6 +1008,482 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) 
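A worked example of the row-pointer/row-index conversions declared in device_utils.h above, for a hypothetical 3-row CSR matrix:

   /* d_row_ptr = {0, 2, 2, 5}                              (row 1 is empty)
    * hypreDevice_CsrRowPtrsToIndices  =>  d_row_ind = {0, 0, 2, 2, 2}
    * hypreDevice_CsrRowIndicesToPtrs inverts this with a vectorized
    * lower_bound: row_ptr[i] is the first position in the sorted
    * d_row_ind whose value is >= i, which recovers {0, 2, 2, 5}. */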
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(HYPRE_USING_SYCL)
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if
+
+#include
+#include
+#include
+#include
+
+#define __forceinline__ __inline__ __attribute__((always_inline))
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls
+ * NOTE: IN HYPRE'S DEFAULT STREAM
+ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ */
+
+template <typename InputIter1, typename InputIter2, typename OutputIter>
+OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last,
+                            InputIter2 input_first, OutputIter result) {
+   static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+         std::is_same<
+            typename std::iterator_traits<InputIter2>::iterator_category,
+            std::random_access_iterator_tag>::value &&
+         std::is_same<
+            typename std::iterator_traits<OutputIter>::iterator_category,
+            std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+   auto perm_begin =
+      oneapi::dpl::make_permutation_iterator(input_first, map_first);
+   const int n = ::std::distance(map_first, map_last);
+
+   return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())),
+                            perm_begin, perm_begin + n, result);
+}
+
+#if defined(HYPRE_DEBUG)
+#if defined(HYPRE_USING_CUDA)
+#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
+#endif
+#else // #if defined(HYPRE_DEBUG)
+#define GPU_LAUNCH_SYNC
+#endif // defined(HYPRE_DEBUG)
+
+#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                                            \
+{                                                                                                          \
+   if ( gridsize[0] == 0 || blocksize[0] == 0 )                                                            \
+   {                                                                                                       \
+      hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n",                \
+                   __FILE__, __LINE__,                                                                     \
+                   gridsize[0], blocksize[0]);                                                             \
+      assert(0); exit(1);                                                                                  \
+   }                                                                                                       \
+   else                                                                                                    \
+   {                                                                                                       \
+      hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
+         [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] {                     \
+            (kernel_name)(item, __VA_ARGS__);                                                              \
+      });                                                                                                  \
+   }                                                                                                       \
+}
+
+/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right
+ * The following one works OK for now */
+
+#define HYPRE_ONEDPL_CALL(func_name, ...)                                                    \
+   func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
+
+// /* return the number of threads in block */
+// template <hypre_int dim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_num_threads()
+// {
+//    switch (dim)
+//    {
+//       case 1:
+//          return (blockDim.x);
+//       case 2:
+//          return (blockDim.x * blockDim.y);
+//       case 3:
+//          return (blockDim.x * blockDim.y * blockDim.z);
+//    }
+
+//    return -1;
+// }
+
+/* return the number of (sub_groups) warps in (work-group) block */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_num_warps(sycl::nd_item<dim>& item)
+{
+   return item.get_sub_group().get_group_range().get(0);
+}
+
+/* return the thread lane id in warp */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_lane_id(sycl::nd_item<dim>& item)
+{
+   return item.get_sub_group().get_local_linear_id();
+}
+
+// /* return the number of threads in grid */
+// template <hypre_int bdim, hypre_int gdim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_threads()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads();
+// }
+
+/* return the flattened work-item/thread id in global work space,
+ * Note: Since the use-cases always involved bdim = gdim = 1, only
+ * sycl::nd_item<1> is used. SFINAE is used to prevent
+ * other dimensions (i.e., bdim != gdim != 1) */
+template < hypre_int bdim, hypre_int gdim >
+static __forceinline__
+hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item)
+{
+   static_assert(bdim == 1 && gdim == 1);
+   return item.get_global_id(0);
+}
+
+// /* return the number of warps in grid */
+// template <hypre_int bdim, hypre_int gdim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_warps()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps();
+// }
+
+/* return the flattened warp id in grid */
+template <hypre_int bdim, hypre_int gdim>
+static __forceinline__
+hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item)
+{
+   return item.get_group(0) * hypre_gpu_get_num_warps(item) +
+          item.get_sub_group().get_group_linear_id();
+}
+
+// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// static __forceinline__
+// hypre_double atomicAdd(hypre_double* address, hypre_double val)
+// {
+//    hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address;
+//    hypre_ulonglongint old = *address_as_ull, assumed;
+
+//    do {
+//       assumed = old;
+//       old = atomicCAS(address_as_ull, assumed,
+//                       __double_as_longlong(val +
+//                       __longlong_as_double(assumed)));
+
+//       // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//    } while (assumed != old);
+
+//    return __longlong_as_double(old);
+// }
+// #endif
+
+template <typename T>
+static __forceinline__
+T read_only_load( const T *ptr )
+{
+   return *ptr;
+}
+
+// /* exclusive prefix scan */
+// template <typename T>
+// static __forceinline__
+// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
+// {
+// #pragma unroll
+//    for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
+//    {
+//       T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
+//       if ( (lane_id & (d - 1)) == (d - 1) )
+//       {
+//          in += t;
+//       }
+//    }
+
+//    all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1);
+
+//    if (lane_id == HYPRE_WARP_SIZE-1)
+//    {
+//       in = 0;
+//    }
+
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
+
+//       if ( (lane_id & (d - 1)) == (d - 1))
+//       {
+//          if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
+//          {
+//             in += t;
+//          }
+//          else
+//          {
+//             in = t;
+//          }
+//       }
+//    }
+//    return in;
+// }
+
+template <typename T, hypre_int dim>
+static __forceinline__
+T warp_reduce_sum(T in, sycl::nd_item<dim>& item)
+{
+   sycl::sub_group SG = item.get_sub_group();
+   //sycl::ext::oneapi::reduce(SG, in, std::plus<T>());
+#pragma unroll
+   for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1)
+   {
+      in += SG.shuffle_down(in, d);
+   }
+   return in;
+}
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_sum(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_reduce_max(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_max(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_reduce_min(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_min(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// static __forceinline__
+// hypre_int next_power_of_2(hypre_int n)
+// {
+//    if (n <= 0)
+//    {
+//       return 0;
+//    }
+
+//    /* if n is power of 2, return itself */
+//    if ( (n & (n - 1)) == 0 )
+//    {
+//       return n;
+//    }
+
+//    n |= (n >> 1);
+//    n |= (n >> 2);
+//    n |= (n >> 4);
+//    n |= (n >> 8);
+//    n |= (n >> 16);
+//    n ^= (n >> 1);
+//    n = (n << 1);
+
+//    return n;
+// }
+
+// template <typename T>
+// struct absolute_value : public thrust::unary_function<T, T>
+// {
+//    T operator()(const T &x) const
+//    {
+//       return x < T(0) ? -x : x;
+//    }
+// };
+
+// template <typename T1, typename T2>
+// struct TupleComp2
+// {
+//    typedef thrust::tuple<T1, T2> Tuple;
+
+//    bool operator()(const Tuple& t1, const Tuple& t2)
+//    {
+//       if (thrust::get<0>(t1) < thrust::get<0>(t2))
+//       {
+//          return true;
+//       }
+//       if (thrust::get<0>(t1) > thrust::get<0>(t2))
+//       {
+//          return false;
+//       }
+//       return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2));
+//    }
+// };
+
+// template <typename T1, typename T2>
+// struct TupleComp3
+// {
+//    typedef thrust::tuple<T1, T2> Tuple;
+
+//    bool operator()(const Tuple& t1, const Tuple& t2)
+//    {
+//       if (thrust::get<0>(t1) < thrust::get<0>(t2))
+//       {
+//          return true;
+//       }
+//       if (thrust::get<0>(t1) > thrust::get<0>(t2))
+//       {
+//          return false;
+//       }
+//       if (thrust::get<0>(t2) == thrust::get<1>(t2))
+//       {
+//          return false;
+//       }
+//       return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2);
+//    }
+// };
+
+// template <typename T>
+// struct is_negative : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x < 0);
+//    }
+// };
+
+// template <typename T>
+// struct is_positive : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x > 0);
+//    }
+// };
+
+// template <typename T>
+// struct is_nonnegative : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x >= 0);
+//    }
+// };
+
+template <typename T>
+struct in_range
+{
+   T low, up;
+   in_range(T low_, T up_) { low = low_; up = up_; }
+
+   bool operator()(const T &x) const { return (x >= low && x <= up); }
+};
+
+// template <typename T>
+// struct out_of_range : public thrust::unary_function<T, bool>
+// {
+//    T low, up;
+
+//    out_of_range(T low_, T up_) { low = low_; up = up_; }
+
+//    bool operator()(const T &x)
+//    {
+//       return (x < low || x > up);
+//    }
+// };
+
+#ifdef HYPRE_COMPLEX
+template::value>::type>
+struct less_than
+{
+   T val;
+   less_than(T val_) { val = val_; }
+   bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); }
+};
+#else
+template::value>::type>
+struct less_than
+{
+   T val;
+   less_than(T val_) { val = val_; }
+   bool operator()(const T &x) const { return (x < val); }
+};
+#endif
+// template <typename T>
+// struct modulo : public thrust::unary_function<T, T>
+// {
+//    T val;
+
+//    modulo(T val_) { val = val_; }
+
+//    T operator()(const T &x)
+//    {
+//       return (x % val);
+//    }
+// };
+
+// template <typename T>
+// struct equal : public thrust::unary_function<T, bool>
+// {
+//    T val;
+
+//    equal(T val_) { val = val_; }
+
+//    bool operator()(const T &x)
+//    {
+//       return (x == val);
+//    }
+// };
+
+// struct print_functor
+// {
+//    void operator()(HYPRE_Real val)
+//    {
+//       printf("%f\n", val);
+//    }
+// };
+
+#endif // #if defined(HYPRE_USING_SYCL)
+
+////////////////////////////////////////////////////////////////////////////////////////
+
 #if defined(HYPRE_USING_CUSPARSE)
 cudaDataType hypre_HYPREComplexToCudaDataType();
diff --git a/src/utilities/general.c b/src/utilities/general.c
index c3f7da063f..cd2c49c18b 100644
--- a/src/utilities/general.c
+++ b/src/utilities/general.c
@@ -72,7 +72,13 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_)
    hypre_HandleDeviceData(hypre_handle_) = NULL;
 #endif
 
+// In debug mode, hypre_TFree() checks the pointer location, which requires the
+// hypre_handle_'s compute queue if using sycl. But this was just destroyed above.
+#if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL)
+   free(hypre_handle_);
+#else
    hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST);
+#endif
 
    return hypre_error_flag;
 }
@@ -94,7 +100,38 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_)
    HYPRE_HIP_CALL( hipSetDevice(device_id) );
 #endif
 
-#if defined(HYPRE_USING_GPU)
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_Int nDevices=0;
+   hypre_GetDeviceCount(&nDevices);
+   if (device_id >= nDevices) {
+      hypre_printf("ERROR: SYCL device-ID exceeds the number of devices on-node... \n");
+   }
+
+   sycl::platform platform(sycl::gpu_selector{});
+   auto gpu_devices = platform.get_devices(sycl::info::device_type::gpu);
+   HYPRE_Int local_nDevices=0;
+   for (int i = 0; i < gpu_devices.size(); i++) {
+      // multi-tile GPUs
+      if (gpu_devices[i].get_info<sycl::info::device::partition_max_sub_devices>() > 0) {
+         auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(
+            sycl::info::partition_affinity_domain::numa);
+         for (auto &tile : subDevicesDomainNuma) {
+            if (local_nDevices == device_id) {
+               hypre_HandleDevice(hypre_handle_) = &tile;
+            }
+            local_nDevices++;
+         }
+      }
+      // single-tile GPUs
+      else {
+         if (local_nDevices == device_id) {
+            hypre_HandleDevice(hypre_handle_) = &(gpu_devices[i]);
+         }
+         local_nDevices++;
+      }
+   }
+#endif
+
+#if defined(HYPRE_USING_GPU) && !defined(HYPRE_USING_SYCL)
    if (hypre_handle_)
    {
 #if defined(HYPRE_USING_SYCL)
@@ -417,6 +454,10 @@ HYPRE_PrintDeviceInfo()
    hypre_printf("Max Compute Units: %d\n", max_compute_units);
 #endif
 
+#if defined(HYPRE_USING_SYCL)
+   // WM: TODO
+#endif
+
    return hypre_error_flag;
 }
diff --git a/src/utilities/int_array.c b/src/utilities/int_array.c
index 7a51fbb80d..65ea3f5ef9 100644
--- a/src/utilities/int_array.c
+++ b/src/utilities/int_array.c
@@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v,
 #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
 
 #if defined(HYPRE_USING_GPU)
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif
 
    return ierr;
diff --git a/src/utilities/memory.h b/src/utilities/memory.h
index bd815020c1..6fcaa29a01 100644
--- a/src/utilities/memory.h
+++ b/src/utilities/memory.h
@@ -122,6 +122,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location)
 #endif
    }
 
+   if (location == HYPRE_MEMORY_UNIFIED)
+   {
+      return hypre_MEMORY_UNIFIED;
+   }
+
    return hypre_MEMORY_UNDEFINED;
 }
diff --git a/src/utilities/protos.h b/src/utilities/protos.h
index eb41f99847..ad3b5ff8a8 100644
--- a/src/utilities/protos.h
+++ b/src/utilities/protos.h
@@ -269,8 +269,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
                                            hypre_UnorderedBigIntMap *inverse_map);
 
 #if defined(HYPRE_USING_GPU)
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
-HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
                                       HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y);
@@ -301,10 +301,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod);
 void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e);
 char *hypre_strcpy(char *destination, const char *source);
 
-HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action);
-HYPRE_Int hypre_RestoreSyncCudaCompute();
-HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr);
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action);
+HYPRE_Int hypre_RestoreSyncDeviceCompute();
+HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr);
+HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle);
 
 /* handle.c */
 HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse );
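A usage sketch of the renamed synchronization controls (function names as introduced by this patch; the batching scenario is illustrative, not taken from hypre):

   HYPRE_Int sync_flag;
   hypre_GetSyncDeviceCompute(&sync_flag);        /* action 3: read the current sync setting */
   hypre_SetSyncDeviceCompute(0);                 /* action 0: disable per-call stream syncs */
   /* ... launch a batch of device kernels ... */
   hypre_RestoreSyncDeviceCompute();              /* action 2: restore the build default */
   hypre_SyncDeviceComputeStream(hypre_handle()); /* action 4: sync the stream if the flag is set */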