diff --git a/src/IJ_mv/IJMatrix_parcsr_device.c b/src/IJ_mv/IJMatrix_parcsr_device.c
index 9a83b3da62..00f573553f 100644
--- a/src/IJ_mv/IJMatrix_parcsr_device.c
+++ b/src/IJ_mv/IJMatrix_parcsr_device.c
@@ -164,7 +164,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix       *matrix,
    /* mark unwanted elements as -1 */
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
                      (HYPRE_Int *) row_indexes, ncols, indicator );

    auto new_end = HYPRE_THRUST_CALL(
@@ -233,7 +233,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int  N0, HYPRE_BigInt  *I0, HYPRE_Big
    /*
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
    */

    /* output X: 0: keep, 1: zero-out */
diff --git a/src/IJ_mv/IJVector_parcsr_device.c b/src/IJ_mv/IJVector_parcsr_device.c
index b34b1162f7..a26d19dd93 100644
--- a/src/IJ_mv/IJVector_parcsr_device.c
+++ b/src/IJ_mv/IJVector_parcsr_device.c
@@ -251,7 +251,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector)
       /* set/add to local vector */
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
                         vec_start, new_sora,
                         hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) );

diff --git a/src/config/configure.in b/src/config/configure.in
index 06e6a22796..8edcabc68c 100644
--- a/src/config/configure.in
+++ b/src/config/configure.in
@@ -2316,7 +2316,7 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"],

     if test "$hypre_user_chose_cuflags" = "no"
     then
-       CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
+       CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
        if test "$hypre_using_debug" = "yes"
        then
           CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
diff --git a/src/configure b/src/configure
index 7993465afb..66d6707f63 100755
--- a/src/configure
+++ b/src/configure
@@ -9143,7 +9143,7 @@ $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h

     if test "$hypre_user_chose_cuflags" = "no"
     then
-       CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
+       CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
        if test "$hypre_using_debug" = "yes"
        then
           CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
diff --git a/src/distributed_matrix/distributed_matrix_parcsr.c b/src/distributed_matrix/distributed_matrix_parcsr.c
index 0df9ae59e8..e6d986dddb 100644
--- a/src/distributed_matrix/distributed_matrix_parcsr.c
+++ b/src/distributed_matrix/distributed_matrix_parcsr.c
@@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix,

    // RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done
 #if defined(HYPRE_USING_GPU)
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    return(ierr);
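/*
 * [illustrative sketch -- not part of the patch] The hunks above and below are a
 * mechanical rename, HYPRE_CUDA_LAUNCH -> HYPRE_GPU_LAUNCH and
 * hypre_SyncCudaComputeStream -> hypre_SyncDeviceComputeStream, so call sites
 * no longer name a specific backend. hypre's real macro lives in its device
 * utility headers, which this diff does not touch; the dispatch idea for the
 * CUDA/HIP case is roughly:
 */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#define HYPRE_GPU_LAUNCH(kernel, gridsize, blocksize, ...)                     \
   do                                                                          \
   {                                                                           \
      /* launch on hypre's compute stream; a SYCL build would instead       */ \
      /* submit the kernel body to the compute queue                        */ \
      kernel <<< (gridsize), (blocksize), 0,                                   \
                 hypre_HandleComputeStream(hypre_handle()) >>> (__VA_ARGS__);  \
   } while (0)
#endif
/*
 * The CUFLAGS change adds -D_GLIBCXX_USE_TBB_PAR_BACKEND=0, which makes
 * libstdc++'s parallel algorithms fall back to their serial backend instead of
 * requiring TBB -- the SYCL build already routes parallelism through oneDPL.
 */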
diff --git a/src/parcsr_ls/ads.c b/src/parcsr_ls/ads.c
index 03c3fccb3d..9288e60b29 100644
--- a/src/parcsr_ls/ads.c
+++ b/src/parcsr_ls/ads.c
@@ -627,12 +627,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            F2V_diag_nrows, 3, F2V_diag_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pi_diag_data );
       }
@@ -693,12 +693,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            F2V_offd_nrows, 3, F2V_offd_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pi_offd_data );
       }
@@ -907,7 +907,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            F2V_diag_nrows, 3, F2V_diag_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pix_diag_data, Piy_diag_data, Piz_diag_data );
       }
@@ -987,7 +987,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            F2V_offd_nrows, 3, F2V_offd_I, NULL,
                            RT100_data, RT010_data, RT001_data,
                            Pix_offd_data, Piy_offd_data, Piz_offd_data );
       }
diff --git a/src/parcsr_ls/ame.c b/src/parcsr_ls/ame.c
index 1f4de312c3..f68266d5a8 100644
--- a/src/parcsr_ls/ame.c
+++ b/src/parcsr_ls/ame.c
@@ -496,7 +496,7 @@ HYPRE_Int hypre_AMESetup(void *esolver)
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
                         nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc );
    }
    else
diff --git a/src/parcsr_ls/ams.c b/src/parcsr_ls/ams.c
index c1d43292a5..9a90c0a71c 100644
--- a/src/parcsr_ls/ams.c
+++ b/src/parcsr_ls/ams.c
@@ -194,7 +194,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
    }
    else
@@ -241,7 +241,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
                         size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
    }
    else
@@ -456,10 +456,10 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
+   HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
                      nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);

-   //hypre_SyncCudaComputeStream(hypre_handle());
+   //hypre_SyncDeviceComputeStream(hypre_handle());

    return hypre_error_flag;
 }
@@ -787,7 +787,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
                         num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
    }
    else
@@ -1623,12 +1623,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_diag_nnz, dim, G_diag_J, Pi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data,
                            Gx_data, Gy_data, Gz_data,
                            Pi_diag_data );
       }
@@ -1696,12 +1696,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_offd_nnz, dim, G_offd_J, Pi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data,
                            Gx_data, Gy_data, Gz_data,
                            Pi_offd_data );
       }
@@ -1944,7 +1944,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
                            Pix_diag_data, Piy_diag_data, Piz_diag_data );
       }
@@ -2010,7 +2010,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
                            Pix_diag_data, Piy_diag_data, NULL );
       }
@@ -2068,7 +2068,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
                            Pix_diag_data, NULL, NULL );
       }
@@ -2145,7 +2145,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
                            Pix_offd_data, Piy_offd_data, Piz_offd_data );
       }
@@ -2227,7 +2227,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
                            Pix_offd_data, Piy_offd_data, NULL );
       }
@@ -2299,7 +2299,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
                            Pix_offd_data, NULL, NULL );
       }
@@ -2501,12 +2501,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_diag_nnz, dim, G_diag_J, GPi_diag_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                            G_diag_nrows, dim, G_diag_I, G_diag_data,
                            Gx_data, Gy_data, Gz_data,
                            GPi_diag_data );
       }
@@ -2575,12 +2575,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
                            G_offd_nnz, dim, G_offd_J, GPi_offd_J );

          gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
                            G_offd_nrows, dim, G_offd_I, G_offd_data,
                            Gx_data, Gy_data, Gz_data,
                            GPi_offd_data );
       }
@@ -2815,7 +2815,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
                         nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
    }
    else
@@ -3401,7 +3401,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
    {
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim);
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
                         Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data,
                         Gt_offd_I, Gt_offd_data,
                         Gx_data, Gy_data, Gz_data );
    }
diff --git a/src/parcsr_ls/par_2s_interp_device.c b/src/parcsr_ls/par_2s_interp_device.c
index 94156f2e8b..5ad3f91c46 100644
--- a/src/parcsr_ls/par_2s_interp_device.c
+++ b/src/parcsr_ls/par_2s_interp_device.c
@@ -109,7 +109,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix  *A,
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

    /* only for rows corresponding to F2 (notice flag == -1) */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_local,
                      A_offd_nnz > 0,
@@ -160,7 +160,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix  *A,
     * diagnoally scale As_F2F (from both sides) and replace the diagonal */
    gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
                      gDim, bDim,
                      AF2F_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
@@ -329,7 +329,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
    dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
    dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
                      gDim, bDim,
                      AFC_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)),
@@ -388,7 +388,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
    gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

    /* only for rows corresponding to F2 (notice flag == -1) */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_local,
                      A_offd_nnz > 0,
@@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix  *A,
     * diagnoally scale As_F2F (from both sides) and replace the diagonal */
    gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
                      gDim, bDim,
                      AF2F_nr_local,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
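/*
 * [illustrative sketch -- not part of the patch] The launches in these files
 * size their grids with hypre_GetDefaultDeviceGridDimension(n, "thread"|"warp",
 * bDim): "thread" asks for one thread per element (e.g. per nonzero copied),
 * "warp" for one warp per matrix row so the row's entries are scanned
 * cooperatively. Assuming hypre's HYPRE_WARP_SIZE macro, the sizing
 * arithmetic behind such a helper is essentially:
 */
static dim3 sketch_grid_dimension(HYPRE_Int n, const char *granularity, dim3 bDim)
{
   const HYPRE_Int threads_per_block = bDim.x * bDim.y * bDim.z;
   HYPRE_Int num_blocks;
   if (granularity[0] == 't')   /* "thread": one work item per element */
   {
      num_blocks = (n + threads_per_block - 1) / threads_per_block;
   }
   else                         /* "warp": one warp per row */
   {
      const HYPRE_Int warps_per_block = threads_per_block / HYPRE_WARP_SIZE;
      num_blocks = (n + warps_per_block - 1) / warps_per_block;
   }
   return dim3(num_blocks > 0 ? num_blocks : 1, 1, 1);
}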
diff --git a/src/parcsr_ls/par_coarsen_device.c b/src/parcsr_ls/par_coarsen_device.c
index 70749d9abc..a2c9fb47e2 100644
--- a/src/parcsr_ls/par_coarsen_device.c
+++ b/src/parcsr_ls/par_coarsen_device.c
@@ -331,7 +331,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S,     /* in */
    HYPRE_Int *new_end;

    /* init CF_marker_diag and measure_diag: remove some special nodes */
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
                      num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag );

    /* communicate for measure_offd */
@@ -494,7 +494,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S,     /* in
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
                      gDim, bDim,
                      graph_diag_size,
                      graph_diag,
diff --git a/src/parcsr_ls/par_gauss_elim.c b/src/parcsr_ls/par_gauss_elim.c
index dcce956b40..85010edbaa 100644
--- a/src/parcsr_ls/par_gauss_elim.c
+++ b/src/parcsr_ls/par_gauss_elim.c
@@ -424,7 +424,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real
    dim3 bDim(BLOCK_SIZE, 1, 1);
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );

    return hypre_error_flag;
 }
diff --git a/src/parcsr_ls/par_indepset_device.c b/src/parcsr_ls/par_indepset_device.c
index 3d1d9c60c1..8e40e2c9f6 100644
--- a/src/parcsr_ls/par_indepset_device.c
+++ b/src/parcsr_ls/par_indepset_device.c
@@ -170,7 +170,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix  *S,
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetMain, gDim, bDim,
                      graph_diag_size, graph_diag, measure_diag, measure_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      IS_marker_diag, IS_marker_offd, IS_offd_temp_mark );
@@ -186,7 +186,7 @@ hypre_BoomerAMGIndepSetDevice( hypre_ParCSRMatrix  *S,

    /* adjust IS_marker_diag from the received */
    gDim = hypre_GetDefaultDeviceGridDimension(num_elmts_send, "thread", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_IndepSetFixMarker, gDim, bDim,
                      IS_marker_diag, num_elmts_send, send_map_elmts,
                      int_send_buf, IS_offd_temp_mark );
diff --git a/src/parcsr_ls/par_interp_device.c b/src/parcsr_ls/par_interp_device.c
index 83139d52ac..3dfac1dca9 100644
--- a/src/parcsr_ls/par_interp_device.c
+++ b/src/parcsr_ls/par_interp_device.c
@@ -197,7 +197,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getnnz, gDim, bDim,
                      n_fine, S_diag_i, S_diag_j, S_offd_i, S_offd_j, CF_marker, CF_marker_offd,
                      num_functions, dof_func_dev, dof_func_offd, P_diag_i, P_offd_i);
@@ -228,7 +228,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,

    if (interp_type == 3)
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         hypre_ParCSRMatrixSocDiagJ(S),
@@ -241,7 +241,7 @@ hypre_BoomerAMGBuildDirInterpDevice( hypre_ParCSRMatrix   *A,
    }
    else
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildDirInterp_getcoef_v2, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         hypre_ParCSRMatrixSocDiagJ(S),
@@ -1161,7 +1161,7 @@ hypre_BoomerAMGBuildInterpOnePntDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildInterpOnePnt_getnnz, gDim, bDim,
                      n_fine, A_diag_i, A_strong_diag_j, A_diag_a, A_offd_i, A_strong_offd_j, A_offd_a,
                      CF_marker, CF_marker_offd, diag_compress_marker, offd_compress_marker,
                      P_diag_i, P_diag_j_temp, P_offd_i, P_offd_j_temp);
diff --git a/src/parcsr_ls/par_interp_trunc_device.c b/src/parcsr_ls/par_interp_trunc_device.c
index 2deaf29eff..f73270e4c8 100644
--- a/src/parcsr_ls/par_interp_trunc_device.c
+++ b/src/parcsr_ls/par_interp_trunc_device.c
@@ -162,7 +162,7 @@ hypre_BoomerAMGInterpTruncationDevice( hypre_ParCSRMatrix *P, HYPRE_Real trunc_f
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_InterpTruncation, gDim, bDim,
                      nrows, trunc_factor, max_elmts, P_rowptr, P_j, P_a );

    /* build new P_diag and P_offd */
diff --git a/src/parcsr_ls/par_lr_interp_device.c b/src/parcsr_ls/par_lr_interp_device.c
index e21d3e8cdb..23a4d723af 100644
--- a/src/parcsr_ls/par_lr_interp_device.c
+++ b/src/parcsr_ls/par_lr_interp_device.c
@@ -87,7 +87,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -128,7 +128,7 @@ hypre_BoomerAMGBuildExtInterpDevice(hypre_ParCSRMatrix  *A,
    /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@@ -273,7 +273,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix   *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -352,7 +352,7 @@ hypre_BoomerAMGBuildExtPIInterpDevice( hypre_ParCSRMatrix   *A,
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_twiaff_w,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_ParCSRMatrixFirstRowIndex(AFF),
@@ -502,7 +502,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_of_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
                      gDim, bDim,
                      A_nr_of_rows,
                      A_offd_nnz > 0,
@@ -545,7 +545,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    dtmp = hypre_TAlloc(HYPRE_Complex, W_nr_of_rows, HYPRE_MEMORY_DEVICE);
    hypre_GpuProfilingPushRange("Compute D_tmp");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
@@ -587,7 +587,7 @@ hypre_BoomerAMGBuildExtPEInterpDevice(hypre_ParCSRMatrix  *A,
    /* 6. Form matrix ~{A_FC}, (return twAFC in AFC data structure) */
    hypre_GpuProfilingPushRange("Compute interp matrix");
    gDim = hypre_GetDefaultDeviceGridDimension(W_nr_of_rows, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_aff_afc_epe,
                      gDim, bDim,
                      W_nr_of_rows,
                      hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(AFF)),
diff --git a/src/parcsr_ls/par_lr_restr_device.c b/src/parcsr_ls/par_lr_restr_device.c
index 18bf10fc88..97f3b8be9b 100644
--- a/src/parcsr_ls/par_lr_restr_device.c
+++ b/src/parcsr_ls/par_lr_restr_device.c
@@ -254,7 +254,7 @@ hypre_BoomerAMGBuildRestrNeumannAIRDevice( hypre_ParCSRMatrix   *A,
    /* assemble the diagonal part of R from Z */
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);
-   HYPRE_CUDA_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypre_BoomerAMGBuildRestrNeumannAIR_assembleRdiag, gDim, bDim,
                      n_cpts, Fmap, Cmap, Z_diag_i, Z_diag_j, Z_diag_a,
                      R_diag_i, R_diag_j, R_diag_a);

    num_cols_offd_R = num_cols_offd_Z;
diff --git a/src/parcsr_ls/par_mod_multi_interp_device.c b/src/parcsr_ls/par_mod_multi_interp_device.c
index 5aea7a00d3..36d20ab022 100644
--- a/src/parcsr_ls/par_mod_multi_interp_device.c
+++ b/src/parcsr_ls/par_mod_multi_interp_device.c
@@ -343,7 +343,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(remaining, "warp", bDim);

       /* output diag_shifts is 0/1 indicating if points_left_dev[i] is picked in this pass */
-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_pass_order_count,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_pass_order_count,
                         gDim, bDim,
                         remaining,
                         current_pass,
@@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
       dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
       dim3 gDim = hypre_GetDefaultDeviceGridDimension(n_fine, "warp", bDim);

-      HYPRE_CUDA_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypreCUDAKernel_cfmarker_masked_rowsum, gDim, bDim,
                         n_fine, A_diag_i, A_diag_j, A_diag_data,
                         A_offd_i, A_offd_j, A_offd_data,
                         CF_marker,
@@ -591,7 +591,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_insert_remaining_weights, gDim, bDim,
                            pass_starts[p + 1], pass_starts[p + 2], pass_order,
                            Pi_diag_i, Pi_diag_j, Pi_diag_data,
                            P_diag_i, P_diag_j, P_diag_data,
@@ -654,7 +654,7 @@ hypre_BoomerAMGBuildModMultipassDevice( hypre_ParCSRMatrix  *A,
          dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
          dim3 gDim = hypre_GetDefaultDeviceGridDimension(npoints, "warp", bDim);

-         HYPRE_CUDA_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
+         HYPRE_GPU_LAUNCH( hypreCUDAKernel_populate_big_P_offd_j, gDim, bDim,
                            pass_starts[p + 1],
                            pass_starts[p + 2],
                            pass_order,
@@ -893,7 +893,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
                      num_points, color, pass_order, pass_marker, pass_marker_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      P_diag_i, P_offd_i );
@@ -921,7 +921,7 @@ hypre_GenerateMultipassPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_j_Poffd_j, gDim, bDim,
                      num_points,
                      color,
                      pass_order,
@@ -1144,7 +1144,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Pdiag_i_Poffd_i, gDim, bDim,
                      num_points, color, pass_order, pass_marker, pass_marker_offd,
                      S_diag_i, S_diag_j, S_offd_i, S_offd_j,
                      Q_diag_i, Q_offd_i );
@@ -1173,7 +1173,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_generate_Qdiag_j_Qoffd_j, gDim, bDim,
                      num_points,
                      color,
                      pass_order,
@@ -1244,7 +1244,7 @@ hypre_GenerateMultiPiDevice( hypre_ParCSRMatrix  *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_points, "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_mutli_pi_rowsum, gDim, bDim,
                      num_points, pass_order, A_diag_i, A_diag_data,
                      Pi_diag_i, Pi_diag_data, Pi_offd_i, Pi_offd_data,
                      w_row_sum );
diff --git a/src/parcsr_ls/par_relax.c b/src/parcsr_ls/par_relax.c
index 608bc4209d..63d6b7df03 100644
--- a/src/parcsr_ls/par_relax.c
+++ b/src/parcsr_ls/par_relax.c
@@ -1117,8 +1117,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A,

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    /*-----------------------------------------------------------------
@@ -1144,8 +1144,8 @@ hypre_BoomerAMGRelax7Jacobi( hypre_ParCSRMatrix *A,
    }

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    return hypre_error_flag;
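/*
 * [pattern summary -- grounded in the hunks above and in par_csr_matvec.c
 * below] par_relax.c shows the renamed save/disable/restore idiom around hot
 * loops: per-call stream synchronization is switched off, the kernels are
 * queued back to back, and a single synchronization is issued at the end:
 */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   HYPRE_Int sync_stream;
   hypre_GetSyncDeviceCompute(&sync_stream);       /* save the caller's policy       */
   hypre_SetSyncDeviceCompute(0);                  /* queue kernels asynchronously   */
   /* ... launch the relaxation / matvec kernels ... */
   hypre_SetSyncDeviceCompute(sync_stream);        /* restore the caller's policy    */
   hypre_SyncDeviceComputeStream(hypre_handle());  /* one synchronization at the end */
#endif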
diff --git a/src/parcsr_ls/par_relax_more_device.c b/src/parcsr_ls/par_relax_more_device.c
index 00f6f639cf..3388da1f82 100644
--- a/src/parcsr_ls/par_relax_more_device.c
+++ b/src/parcsr_ls/par_relax_more_device.c
@@ -155,7 +155,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
    bDim = hypre_GetDefaultDeviceBlockDimension();
    gDim = hypre_GetDefaultDeviceGridDimension(A_num_rows, "warp", bDim);

-   HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
+   HYPRE_GPU_LAUNCH(hypreCUDAKernel_CSRMaxEigEstimate,
                      gDim,
                      bDim,
                      A_num_rows,
@@ -169,7 +169,7 @@ hypre_ParCSRMaxEigEstimateDevice( hypre_ParCSRMatrix *A,
                      rowsums_upper,
                      scale);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    e_min = HYPRE_THRUST_CALL(reduce, rowsums_lower, rowsums_lower + A_num_rows, (HYPRE_Real)0,
                              thrust::minimum<HYPRE_Real>());
@@ -323,7 +323,7 @@ hypre_ParCSRMaxEigEstimateCGDevice(hypre_ParCSRMatrix *A,     /* matrix to relax
    /* set residual to random */
    hypre_CurandUniform(local_size, r_data, 0, 0, 0, 0);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    HYPRE_THRUST_CALL(transform, r_data, r_data + local_size, r_data,
diff --git a/src/parcsr_ls/par_strength_device.c b/src/parcsr_ls/par_strength_device.c
index af6d9b0ad2..3f884ee92b 100644
--- a/src/parcsr_ls/par_strength_device.c
+++ b/src/parcsr_ls/par_strength_device.c
@@ -140,7 +140,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix    *A,

    if (abs_soc)
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateSabs_rowcount, gDim, bDim,
                         num_variables, max_row_sum, strength_threshold,
                         A_diag_data, A_diag_i, A_diag_j,
                         A_offd_data, A_offd_i, A_offd_j,
@@ -150,7 +150,7 @@ hypre_BoomerAMGCreateSDevice(hypre_ParCSRMatrix    *A,
    }
    else
    {
-      HYPRE_CUDA_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_BoomerAMGCreateS_rowcount, gDim, bDim,
                         num_variables, max_row_sum, strength_threshold,
                         A_diag_data, A_diag_i, A_diag_j,
                         A_offd_data, A_offd_i, A_offd_j,
diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c
index 35fef28c8d..9786d21d31 100644
--- a/src/parcsr_mv/par_csr_communication.c
+++ b/src/parcsr_mv/par_csr_communication.c
@@ -434,7 +434,7 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int            job,
    recv_data = recv_data_in;

    // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI.
    // Need to check MPI documentation whether this is acutally true
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

    num_requests = num_sends + num_recvs;
diff --git a/src/parcsr_mv/par_csr_matop.c b/src/parcsr_mv/par_csr_matop.c
index 8eeb6dcf4c..97552f4aa1 100644
--- a/src/parcsr_mv/par_csr_matop.c
+++ b/src/parcsr_mv/par_csr_matop.c
@@ -4113,7 +4113,7 @@ hypre_ParTMatmul( hypre_ParCSRMatrix  *A,
    if ( hypre_GetExecPolicy2(memory_location_A, memory_location_B) == HYPRE_EXEC_DEVICE )
    {
       hypre_CSRMatrixMoveDiagFirstDevice(hypre_ParCSRMatrixDiag(C));
-      hypre_SyncCudaComputeStream(hypre_handle());
+      hypre_SyncDeviceComputeStream(hypre_handle());
    }
 #endif

diff --git a/src/parcsr_mv/par_csr_matop_device.c b/src/parcsr_mv/par_csr_matop_device.c
index 9387a863f8..992dea4964 100644
--- a/src/parcsr_mv/par_csr_matop_device.c
+++ b/src/parcsr_mv/par_csr_matop_device.c
@@ -306,7 +306,7 @@ hypre_MergeDiagAndOffdDevice(hypre_ParCSRMatrix *A)
    hypre_CSRMatrixData(B) = B_a;
    hypre_CSRMatrixMemoryLocation(B) = HYPRE_MEMORY_DEVICE;

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return B;
 }
@@ -628,7 +628,7 @@ hypre_ConcatDiagAndOffdDevice(hypre_ParCSRMatrix *A)
    const dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    const dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A_diag), "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(A_diag),
                      hypre_CSRMatrixNumCols(A_diag),
@@ -745,7 +745,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,
    dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
    dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_ParCSRMatrixNumRows(A), "warp", bDim);

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(A_diag),
                      hypre_CSRMatrixNumCols(A_diag),
@@ -777,7 +777,7 @@ hypre_ConcatDiagOffdAndExtDevice(hypre_ParCSRMatrix *A,

    hypre_assert(hypre_CSRMatrixNumCols(E_diag) == hypre_CSRMatrixNumCols(A_diag));

-   HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
+   HYPRE_GPU_LAUNCH( hypreCUDAKernel_ConcatDiagAndOffd,
                      gDim, bDim,
                      hypre_CSRMatrixNumRows(E_diag),
                      hypre_CSRMatrixNumCols(E_diag),
@@ -1044,7 +1044,7 @@ hypre_ParCSRMatrixGetRowDevice( hypre_ParCSRMatrix  *mat,
       *values = hypre_ParCSRMatrixRowvalues(mat);
    }

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return hypre_error_flag;
 }
@@ -1211,21 +1211,21 @@ hypre_ParCSRMatrixDropSmallEntriesDevice( hypre_ParCSRMatrix *A,

    if (type == -1)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols < -1 >, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
    }
    if (type == 1)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<1>, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
    }
    if (type == 2)
    {
-      HYPRE_CUDA_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
+      HYPRE_GPU_LAUNCH( hypre_ParCSRMatrixDropSmallEntriesDevice_getElmtTols<2>, gDim, bDim,
                         hypre_CSRMatrixNumRows(A_diag), tol, hypre_CSRMatrixI(A_diag),
                         hypre_CSRMatrixJ(A_diag), hypre_CSRMatrixData(A_diag), hypre_CSRMatrixI(A_offd),
                         hypre_CSRMatrixData(A_offd), elmt_tols_diag, elmt_tols_offd);
@@ -1603,7 +1603,7 @@ hypre_ParCSRDiagScale( HYPRE_ParCSRMatrix HA,
    HYPRE_Int ierr = 0;
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
    hypreDevice_DiagScaleVector(local_size, A_i, A_data, y_data, 0.0, x_data);
-   //hypre_SyncCudaComputeStream(hypre_handle());
+   //hypre_SyncDeviceComputeStream(hypre_handle());
 #else /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
    HYPRE_Int i;
 #if defined(HYPRE_USING_DEVICE_OPENMP)
diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c
index 30921fe960..d53f74a9d8 100644
--- a/src/parcsr_mv/par_csr_matvec.c
+++ b/src/parcsr_mv/par_csr_matvec.c
@@ -56,8 +56,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
 #if defined(HYPRE_USING_GPU)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    HYPRE_ANNOTATE_FUNC_BEGIN;
@@ -348,8 +348,8 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex       alpha,
    }

 #if defined(HYPRE_USING_GPU)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

 #ifdef HYPRE_PROFILE
@@ -415,8 +415,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex    alpha,
 #if defined(HYPRE_USING_GPU)
    HYPRE_Int sync_stream;
-   hypre_GetSyncCudaCompute(&sync_stream);
-   hypre_SetSyncCudaCompute(0);
+   hypre_GetSyncDeviceCompute(&sync_stream);
+   hypre_SetSyncDeviceCompute(0);
 #endif

    HYPRE_ANNOTATE_FUNC_BEGIN;
@@ -724,8 +724,8 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex    alpha,
    }

 #if defined(HYPRE_USING_GPU)
-   hypre_SetSyncCudaCompute(sync_stream);
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SetSyncDeviceCompute(sync_stream);
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif

 #ifdef HYPRE_PROFILE
diff --git a/src/parcsr_mv/par_csr_triplemat_device.c b/src/parcsr_mv/par_csr_triplemat_device.c
index 0b8a67fd63..5c77572e04 100644
--- a/src/parcsr_mv/par_csr_triplemat_device.c
+++ b/src/parcsr_mv/par_csr_triplemat_device.c
@@ -497,7 +497,7 @@ hypre_ParCSRTMatMatKTDevice( hypre_ParCSRMatrix  *A,

    hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return C;
 }
@@ -817,7 +817,7 @@ hypre_ParCSRMatrixRAPKTDevice( hypre_ParCSRMatrix *R,

    hypre_assert(!hypre_CSRMatrixCheckDiagFirstDevice(hypre_ParCSRMatrixDiag(C)));

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

    return C;
 }
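/*
 * [orientation note and host-side sketch -- not part of the patch] The
 * csr_matop_device.c rewrite below hoists hypre_CSRMatrixSplitDevice_core
 * ahead of its callers and adds a SYCL variant. Both variants build the
 * merged offd column map the same way: concatenate B_ext's offd global
 * columns with col_map_offd_B, sort, unique, then lower_bound to rewrite each
 * global column as its position in the deduplicated map. The same idea,
 * serially on the host for clarity (names here are hypothetical):
 */
#include <algorithm>
static void sketch_compress_cols(HYPRE_BigInt *col_map, HYPRE_Int map_len,
                                 const HYPRE_BigInt *bigj, HYPRE_Int nnz,
                                 HYPRE_Int *j_local, HYPRE_Int *num_cols_out)
{
   /* col_map holds the concatenated candidate columns on entry */
   std::sort(col_map, col_map + map_len);
   HYPRE_Int num_cols = (HYPRE_Int)(std::unique(col_map, col_map + map_len) - col_map);
   for (HYPRE_Int k = 0; k < nnz; k++)
   {
      /* index in the sorted, deduplicated map == local column index */
      j_local[k] = (HYPRE_Int)(std::lower_bound(col_map, col_map + num_cols, bigj[k]) - col_map);
   }
   *num_cols_out = num_cols;
}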
diff --git a/src/seq_mv/csr_matop_device.c b/src/seq_mv/csr_matop_device.c
index 5f56789ae4..bacc0b28fe 100644
--- a/src/seq_mv/csr_matop_device.c
+++ b/src/seq_mv/csr_matop_device.c
@@ -110,73 +110,470 @@ hypre_GpuMatDataDestroy(hypre_GpuMatData *data)

 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

-hypre_CSRMatrix*
-hypre_CSRMatrixAddDevice ( HYPRE_Complex    alpha,
-                           hypre_CSRMatrix *A,
-                           HYPRE_Complex    beta,
-                           hypre_CSRMatrix *B )
+HYPRE_Int
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int      job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */
+                                 HYPRE_Int      num_rows,
+                                 HYPRE_Int      B_ext_nnz,
+                                 HYPRE_Int     *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */
+                                 HYPRE_BigInt  *B_ext_bigj, /* Note: [BigInt] global column indices */
+                                 HYPRE_Complex *B_ext_data,
+                                 char          *B_ext_xata, /* companion data with B_ext_data; NULL if none */
+                                 HYPRE_BigInt   first_col_diag_B,
+                                 HYPRE_BigInt   last_col_diag_B,
+                                 HYPRE_Int      num_cols_offd_B,
+                                 HYPRE_BigInt  *col_map_offd_B,
+                                 HYPRE_Int    **map_B_to_C_ptr,
+                                 HYPRE_Int     *num_cols_offd_C_ptr,
+                                 HYPRE_BigInt **col_map_offd_C_ptr,
+                                 HYPRE_Int     *B_ext_diag_nnz_ptr,
+                                 HYPRE_Int     *B_ext_diag_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_diag_j,
+                                 HYPRE_Complex *B_ext_diag_data,
+                                 char          *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */
+                                 HYPRE_Int     *B_ext_offd_nnz_ptr,
+                                 HYPRE_Int     *B_ext_offd_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_offd_j,
+                                 HYPRE_Complex *B_ext_offd_data,
+                                 char          *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ )
 {
-   HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
-   HYPRE_Int        *A_i      = hypre_CSRMatrixI(A);
-   HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
-   HYPRE_Int         nrows_A  = hypre_CSRMatrixNumRows(A);
-   HYPRE_Int         ncols_A  = hypre_CSRMatrixNumCols(A);
-   HYPRE_Int         nnz_A    = hypre_CSRMatrixNumNonzeros(A);
-   HYPRE_Complex    *B_data   = hypre_CSRMatrixData(B);
-   HYPRE_Int        *B_i      = hypre_CSRMatrixI(B);
-   HYPRE_Int        *B_j      = hypre_CSRMatrixJ(B);
-   HYPRE_Int         nrows_B  = hypre_CSRMatrixNumRows(B);
-   HYPRE_Int         ncols_B  = hypre_CSRMatrixNumCols(B);
-   HYPRE_Int         nnz_B    = hypre_CSRMatrixNumNonzeros(B);
-   HYPRE_Complex    *C_data;
-   HYPRE_Int        *C_i;
-   HYPRE_Int        *C_j;
-   HYPRE_Int         nnzC;
-   hypre_CSRMatrix  *C;
-
-   if (nrows_A != nrows_B || ncols_A != ncols_B)
-   {
-      hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! Incompatible matrix dimensions!\n");
+   HYPRE_Int     B_ext_diag_nnz;
+   HYPRE_Int     B_ext_offd_nnz;
+   HYPRE_BigInt *B_ext_diag_bigj = NULL;
+   HYPRE_BigInt *B_ext_offd_bigj = NULL;
+   HYPRE_BigInt *col_map_offd_C;
+   HYPRE_Int    *map_B_to_C = NULL;
+   HYPRE_Int     num_cols_offd_C;
+
+   in_range<HYPRE_BigInt> pred1(first_col_diag_B, last_col_diag_B);
+
+   /* get diag and offd nnz */
+   if (job == 0)
+   {
+      /* query the nnz's */
+      B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if,
+                                          B_ext_bigj,
+                                          B_ext_bigj + B_ext_nnz,
+                                          pred1 );
+      B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz;
+
+      *B_ext_diag_nnz_ptr = B_ext_diag_nnz;
+      *B_ext_offd_nnz_ptr = B_ext_offd_nnz;
+
+      return hypre_error_flag;
+   }
+   else
+   {
+      B_ext_diag_nnz = *B_ext_diag_nnz_ptr;
+      B_ext_offd_nnz = *B_ext_offd_nnz_ptr;
+   }
+
+   /* copy to diag */
+   B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_diag_xata)
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                              /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata)), /* result */
+         pred1 );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+   else
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                  /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data)), /* result */
+         pred1 );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+
+   HYPRE_THRUST_CALL( transform,
+                      B_ext_diag_bigj,
+                      B_ext_diag_bigj + B_ext_diag_nnz,
+                      thrust::make_constant_iterator(first_col_diag_B),
+                      B_ext_diag_j,
+                      thrust::minus<HYPRE_BigInt>());
+
+   hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE);
+
+   /* copy to offd */
+   B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_offd_xata)
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                              /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata)), /* result */
+         thrust::not1(pred1) );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+   else
+   {
+      auto new_end = HYPRE_THRUST_CALL(
+         copy_if,
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)),             /* first   */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data)) + B_ext_nnz, /* last    */
+         B_ext_bigj,                                                                                  /* stencil */
+         thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data)), /* result */
+         thrust::not1(pred1) );
+
+      hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+
+   /* offd map of B_ext_offd Union col_map_offd_B */
+   col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   HYPRE_THRUST_CALL( sort,
+                      col_map_offd_C,
+                      col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique,
+                                              col_map_offd_C,
+                                              col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   num_cols_offd_C = new_end - col_map_offd_C;
-
-      return NULL;
-   }
+#if 1
+   HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE);
+   col_map_offd_C = tmp;
+#else
+   col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+#endif
+
+   /* create map from col_map_offd_B */
+   if (num_cols_offd_B)
+   {
+      map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+      HYPRE_THRUST_CALL( lower_bound,
+                         col_map_offd_C,
+                         col_map_offd_C + num_cols_offd_C,
+                         col_map_offd_B,
+                         col_map_offd_B + num_cols_offd_B,
+                         map_B_to_C );
+   }
+
+   HYPRE_THRUST_CALL( lower_bound,
+                      col_map_offd_C,
+                      col_map_offd_C + num_cols_offd_C,
+                      B_ext_offd_bigj,
+                      B_ext_offd_bigj + B_ext_offd_nnz,
+                      B_ext_offd_j );
+
+   hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE);
+
+   if (map_B_to_C_ptr)
+   {
+      *map_B_to_C_ptr = map_B_to_C;
+   }
+   *num_cols_offd_C_ptr = num_cols_offd_C;
+   *col_map_offd_C_ptr  = col_map_offd_C;
+
+   return hypre_error_flag;
+}
+
+typedef thrust::tuple<HYPRE_Int, HYPRE_Int> Int2;
+struct Int2Unequal : public thrust::unary_function<Int2, bool>
+{
+   __host__ __device__
+   bool operator()(const Int2& t) const
+   {
+      return (thrust::get<0>(t) != thrust::get<1>(t));
+   }
+};
+
+/* this predicate compares first and second element in a tuple in absolute value */
+/* first is assumed to be complex, second to be real > 0 */
+struct cabsfirst_greaterthan_second_pred : public thrust::unary_function<thrust::tuple<HYPRE_Complex, HYPRE_Real>, bool>
+{
+   __host__ __device__
+   bool operator()(const thrust::tuple<HYPRE_Complex, HYPRE_Real>& t) const
+   {
+      const HYPRE_Complex i = thrust::get<0>(t);
+      const HYPRE_Real    j = thrust::get<1>(t);
+
+      return hypre_cabs(i) > j;
+   }
+};
+
+#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */
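/*
 * [correspondence note -- grounded in the two implementations] The SYCL
 * version that follows mirrors the Thrust code above nearly one-to-one:
 *
 *    HYPRE_THRUST_CALL(count_if, ...)           ->  HYPRE_ONEDPL_CALL(std::count_if, ...)
 *    HYPRE_THRUST_CALL(copy_if, ...)  (stencil) ->  HYPRE_ONEDPL_CALL(dpct::copy_if, ...)
 *    thrust::make_zip_iterator(make_tuple(...)) ->  oneapi::dpl::make_zip_iterator(...)
 *    thrust::not1(pred)                         ->  std::not_fn(pred)
 *    sort / unique / lower_bound                ->  std::sort / std::unique / oneapi::dpl::lower_bound
 *
 * One real divergence: no constant iterator is used on the oneDPL path here,
 * so the SYCL code fills a temporary device array with first_col_diag_B and
 * feeds it to std::transform. The shared stencil copy_if shape, for reference
 * (illustrative fragment in the Thrust spelling; ii/bigj/data/out_zip/pred are
 * placeholders):
 */
// auto first = thrust::make_zip_iterator(thrust::make_tuple(ii, bigj, data));
// thrust::copy_if(first, first + nnz,  /* input tuples                      */
//                 bigj,                /* stencil: global column indices    */
//                 out_zip,             /* gathered output tuples            */
//                 pred);               /* in_range, or its negation for offd */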
+#if defined(HYPRE_USING_SYCL)
+
+HYPRE_Int
+hypre_CSRMatrixSplitDevice_core( HYPRE_Int      job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */
+                                 HYPRE_Int      num_rows,
+                                 HYPRE_Int      B_ext_nnz,
+                                 HYPRE_Int     *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */
+                                 HYPRE_BigInt  *B_ext_bigj, /* Note: [BigInt] global column indices */
+                                 HYPRE_Complex *B_ext_data,
+                                 char          *B_ext_xata, /* companion data with B_ext_data; NULL if none */
+                                 HYPRE_BigInt   first_col_diag_B,
+                                 HYPRE_BigInt   last_col_diag_B,
+                                 HYPRE_Int      num_cols_offd_B,
+                                 HYPRE_BigInt  *col_map_offd_B,
+                                 HYPRE_Int    **map_B_to_C_ptr,
+                                 HYPRE_Int     *num_cols_offd_C_ptr,
+                                 HYPRE_BigInt **col_map_offd_C_ptr,
+                                 HYPRE_Int     *B_ext_diag_nnz_ptr,
+                                 HYPRE_Int     *B_ext_diag_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_diag_j,
+                                 HYPRE_Complex *B_ext_diag_data,
+                                 char          *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */
+                                 HYPRE_Int     *B_ext_offd_nnz_ptr,
+                                 HYPRE_Int     *B_ext_offd_ii, /* memory allocated outside */
+                                 HYPRE_Int     *B_ext_offd_j,
+                                 HYPRE_Complex *B_ext_offd_data,
+                                 char          *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ )
+{
+   HYPRE_Int     B_ext_diag_nnz;
+   HYPRE_Int     B_ext_offd_nnz;
+   HYPRE_BigInt *B_ext_diag_bigj = NULL;
+   HYPRE_BigInt *B_ext_offd_bigj = NULL;
+   HYPRE_BigInt *col_map_offd_C;
+   HYPRE_Int    *map_B_to_C = NULL;
+   HYPRE_Int     num_cols_offd_C;
+
+   in_range<HYPRE_BigInt> pred1(first_col_diag_B, last_col_diag_B);
+
+   /* get diag and offd nnz */
+   if (job == 0) {
+      /* query the nnz's */
+      B_ext_diag_nnz = HYPRE_ONEDPL_CALL( std::count_if,
+                                          B_ext_bigj,
+                                          B_ext_bigj + B_ext_nnz,
+                                          pred1 );
+      B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz;
+
+      *B_ext_diag_nnz_ptr = B_ext_diag_nnz;
+      *B_ext_offd_nnz_ptr = B_ext_offd_nnz;
+
+      return hypre_error_flag;
+   }
+   else {
+      B_ext_diag_nnz = *B_ext_diag_nnz_ptr;
+      B_ext_offd_nnz = *B_ext_offd_nnz_ptr;
+   }
+
+   /* copy to diag */
+   B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_diag_xata) {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, B_ext_diag_xata),/* result */
+                                        pred1 );
+
+      //hypre_assert( std::get<0>(new_end.get_iterator_tuple() == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+   else {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data), /* result */
+                                        pred1 );
+
+      //hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz );
+   }
+
+   HYPRE_BigInt *const_iterator = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE);
+   hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, first_col_diag_B, B_ext_diag_nnz*sizeof(HYPRE_BigInt)).wait();
+   HYPRE_ONEDPL_CALL( std::transform,
+                      B_ext_diag_bigj,
+                      B_ext_diag_bigj + B_ext_diag_nnz,
+                      const_iterator, //dpct::make_constant_iterator(first_col_diag_B),
+                      B_ext_diag_j,
+                      std::minus<HYPRE_BigInt>() );
+   hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE);
+
+   hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE);
+
+   /* copy to offd */
+   B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE);
+
+   if (B_ext_offd_xata) {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data, B_ext_xata);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, B_ext_offd_xata), /* result */
+                                        std::not_fn(pred1) );
+
+      // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+   else {
+      auto first = oneapi::dpl::make_zip_iterator(B_ext_ii, B_ext_bigj, B_ext_data);
+      auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if,
+                                        first,              /* first   */
+                                        first + B_ext_nnz,  /* last    */
+                                        B_ext_bigj,         /* stencil */
+                                        oneapi::dpl::make_zip_iterator(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data), /* result */
+                                        std::not_fn(pred1) );
+
+      // hypre_assert( std::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz );
+   }
+
+   /* offd map of B_ext_offd Union col_map_offd_B */
+   col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE);
+
+   HYPRE_ONEDPL_CALL( std::sort,
+                      col_map_offd_C,
+                      col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   HYPRE_BigInt *new_end = HYPRE_ONEDPL_CALL( std::unique,
+                                              col_map_offd_C,
+                                              col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B );
+
+   num_cols_offd_C = new_end - col_map_offd_C;
+
+#if 1
+   HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+   hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE,
+                 HYPRE_MEMORY_DEVICE);
+   hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE);
+   col_map_offd_C = tmp;
+#else
+   col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B,
+                                      HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE);
+#endif
+
+   /* create map from col_map_offd_B */
+   if (num_cols_offd_B) {
+      map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE);
+      HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                         col_map_offd_C,
+                         col_map_offd_C + num_cols_offd_C,
+                         col_map_offd_B,
+                         col_map_offd_B + num_cols_offd_B,
+                         map_B_to_C );
+   }
+
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      col_map_offd_C,
+                      col_map_offd_C + num_cols_offd_C,
+                      B_ext_offd_bigj,
+                      B_ext_offd_bigj + B_ext_offd_nnz,
+                      B_ext_offd_j );
+
+   hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE);
+
+   if (map_B_to_C_ptr) {
+      *map_B_to_C_ptr = map_B_to_C;
+   }
+   *num_cols_offd_C_ptr = num_cols_offd_C;
+   *col_map_offd_C_ptr  = col_map_offd_C;
+
+   return hypre_error_flag;
+}
+
+/* this predicate compares first and second element in a tuple in absolute value */
+/* first is assumed to be complex, second to be real > 0 */
+struct cabsfirst_greaterthan_second_pred
+{
+   bool operator()(const std::tuple<HYPRE_Complex, HYPRE_Real>& t) const
+   {
+      const HYPRE_Complex i = std::get<0>(t);
+      const HYPRE_Real    j = std::get<1>(t);
+
+      return hypre_cabs(i) > j;
+   }
+};
-
-   hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B,
-                        A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL,
-                        &nnzC, &C_i, &C_j, &C_data);
+#endif /* HYPRE_USING_SYCL */
-   C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC);
-   hypre_CSRMatrixI(C) = C_i;
-   hypre_CSRMatrixJ(C) = C_j;
-   hypre_CSRMatrixData(C) = C_data;
-   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
-   hypre_SyncCudaComputeStream(hypre_handle());
+#if defined(HYPRE_USING_GPU)
-   return C;
+hypre_CSRMatrix*
+hypre_CSRMatrixAddDevice ( HYPRE_Complex    alpha,
+                           hypre_CSRMatrix *A,
+                           HYPRE_Complex    beta,
+                           hypre_CSRMatrix *B )
+{
+   HYPRE_Complex    *A_data   = hypre_CSRMatrixData(A);
+   HYPRE_Int        *A_i      = hypre_CSRMatrixI(A);
+   HYPRE_Int        *A_j      = hypre_CSRMatrixJ(A);
+   HYPRE_Int         nrows_A  = hypre_CSRMatrixNumRows(A);
+   HYPRE_Int         ncols_A  = hypre_CSRMatrixNumCols(A);
+   HYPRE_Int         nnz_A    = hypre_CSRMatrixNumNonzeros(A);
+   HYPRE_Complex    *B_data   = hypre_CSRMatrixData(B);
+   HYPRE_Int        *B_i      = hypre_CSRMatrixI(B);
+   HYPRE_Int        *B_j      = hypre_CSRMatrixJ(B);
+   HYPRE_Int         nrows_B  = hypre_CSRMatrixNumRows(B);
+   HYPRE_Int         ncols_B  = hypre_CSRMatrixNumCols(B);
+   HYPRE_Int         nnz_B    = hypre_CSRMatrixNumNonzeros(B);
+   HYPRE_Complex    *C_data;
+   HYPRE_Int        *C_i;
+   HYPRE_Int        *C_j;
+   HYPRE_Int         nnzC;
+   hypre_CSRMatrix  *C;
+
+   if (nrows_A != nrows_B || ncols_A != ncols_B)
+   {
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! Incompatible matrix dimensions!\n");
+
+      return NULL;
+   }
+
+   hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B,
+                        A_i, A_j, alpha, A_data, NULL, B_i, B_j, beta, B_data, NULL, NULL,
+                        &nnzC, &C_i, &C_j, &C_data);
+
+   C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC);
+   hypre_CSRMatrixI(C) = C_i;
+   hypre_CSRMatrixJ(C) = C_j;
+   hypre_CSRMatrixData(C) = C_data;
+   hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE;
+
+   hypre_SyncDeviceComputeStream(hypre_handle());
+
+   return C;
 }

 hypre_CSRMatrix*
 hypre_CSRMatrixMultiplyDevice( hypre_CSRMatrix *A,
                                hypre_CSRMatrix *B)
 {
-   HYPRE_Int        ncols_A = hypre_CSRMatrixNumCols(A);
-   HYPRE_Int        nrows_B = hypre_CSRMatrixNumRows(B);
-   hypre_CSRMatrix *C;
+   HYPRE_Int        ncols_A = hypre_CSRMatrixNumCols(A);
+   HYPRE_Int        nrows_B = hypre_CSRMatrixNumRows(B);
+   hypre_CSRMatrix *C;

-   if (ncols_A != nrows_B)
-   {
-      hypre_printf("Warning! incompatible matrix dimensions!\n");
-      hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! incompatible matrix dimensions!\n");
+   if (ncols_A != nrows_B)
+   {
+      hypre_printf("Warning! incompatible matrix dimensions!\n");
+      hypre_error_w_msg(HYPRE_ERROR_GENERIC,"Warning! incompatible matrix dimensions!\n");

-      return NULL;
-   }
+      return NULL;
+   }

-   hypreDevice_CSRSpGemm(A, B, &C);
+   hypreDevice_CSRSpGemm(A, B, &C);

-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());

-   return C;
+   return C;
 }
@@ -184,12 +581,12 @@ hypre_CSRMatrixTripleMultiplyDevice ( hypre_CSRMatrix *A,
                                       hypre_CSRMatrix *B,
                                       hypre_CSRMatrix *C )
 {
-   hypre_CSRMatrix *BC  = hypre_CSRMatrixMultiplyDevice(B, C);
-   hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC);
+   hypre_CSRMatrix *BC  = hypre_CSRMatrixMultiplyDevice(B, C);
+   hypre_CSRMatrix *ABC = hypre_CSRMatrixMultiplyDevice(A, BC);

-   hypre_CSRMatrixDestroy(BC);
+   hypre_CSRMatrixDestroy(BC);

-   return ABC;
+   return ABC;
 }
@@ -200,14 +597,13 @@ hypre_CSRMatrixTriLowerUpperSolveDevice(char             uplo,
                                         hypre_Vector    *u )
 {
 #if defined(HYPRE_USING_CUSPARSE)
-   hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u);
+   hypre_CSRMatrixTriLowerUpperSolveCusparse(uplo, A, l1_norms, f, u);
 #elif defined(HYPRE_USING_ROCSPARSE)
-   hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u);
+   hypre_CSRMatrixTriLowerUpperSolveRocsparse(uplo, A, l1_norms, f, u);
 #else
-   hypre_error_w_msg(HYPRE_ERROR_GENERIC,
-                     "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n");
+   hypre_error_w_msg(HYPRE_ERROR_GENERIC, "hypre_CSRMatrixTriLowerUpperSolveDevice requires configuration with either cusparse or rocsparse\n");
 #endif
-   return hypre_error_flag;
+   return hypre_error_flag;
 }

 /* split CSR matrix B_ext (extended rows of parcsr B) into diag part and offd part
@@ -229,301 +625,105 @@ hypre_CSRMatrixSplitDevice( hypre_CSRMatrix  *B_ext,
                             hypre_CSRMatrix **B_ext_diag_ptr,
                             hypre_CSRMatrix **B_ext_offd_ptr )
 {
-   HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext);
-   HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext);
-
-   HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE);
-   hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii);
-
-   HYPRE_Int B_ext_diag_nnz;
-   HYPRE_Int B_ext_offd_nnz;
-   HYPRE_Int ierr;
-
-   ierr = hypre_CSRMatrixSplitDevice_core( 0,
-                                           num_rows,
-                                           B_ext_nnz,
-                                           NULL,
-                                           hypre_CSRMatrixBigJ(B_ext),
-                                           NULL,
-                                           NULL,
-                                           first_col_diag_B,
-                                           last_col_diag_B,
-                                           num_cols_offd_B,
-                                           NULL,
-                                           NULL,
-                                           NULL,
-                                           NULL,
-                                           &B_ext_diag_nnz,
-                                           NULL,
NULL, - NULL, - NULL, - &B_ext_offd_nnz, - NULL, - NULL, - NULL, - NULL ); - - HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - ierr = hypre_CSRMatrixSplitDevice_core( 1, - num_rows, - B_ext_nnz, - B_ext_ii, - hypre_CSRMatrixBigJ(B_ext), - hypre_CSRMatrixData(B_ext), - NULL, - first_col_diag_B, - last_col_diag_B, - num_cols_offd_B, - col_map_offd_B, - map_B_to_C_ptr, - num_cols_offd_C_ptr, - col_map_offd_C_ptr, - &B_ext_diag_nnz, - B_ext_diag_ii, - B_ext_diag_j, - B_ext_diag_a, - NULL, - &B_ext_offd_nnz, - B_ext_offd_ii, - B_ext_offd_j, - B_ext_offd_a, - NULL ); - - hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); - - /* convert to row ptrs */ - HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, B_ext_diag_ii); - HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); - - hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); - - /* create diag and offd CSR */ - hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, - last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); - hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); - - hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; - hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; - hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; - hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; - - hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; - hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; - hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; - hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; - hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; - - *B_ext_diag_ptr = B_ext_diag; - *B_ext_offd_ptr = B_ext_offd; - - hypre_SyncCudaComputeStream(hypre_handle()); - - return ierr; -} - -HYPRE_Int -hypre_CSRMatrixSplitDevice_core( HYPRE_Int - job, /* 0: query B_ext_diag_nnz and B_ext_offd_nnz; 1: the real computation */ - HYPRE_Int num_rows, - HYPRE_Int B_ext_nnz, - HYPRE_Int - *B_ext_ii, /* Note: this is NOT row pointers as in CSR but row indices as in COO */ - HYPRE_BigInt *B_ext_bigj, /* Note: [BigInt] global column indices */ - HYPRE_Complex *B_ext_data, - char *B_ext_xata, /* companion data with B_ext_data; NULL if none */ - HYPRE_BigInt first_col_diag_B, - HYPRE_BigInt last_col_diag_B, - HYPRE_Int num_cols_offd_B, - HYPRE_BigInt *col_map_offd_B, - HYPRE_Int **map_B_to_C_ptr, - HYPRE_Int *num_cols_offd_C_ptr, - HYPRE_BigInt **col_map_offd_C_ptr, - HYPRE_Int *B_ext_diag_nnz_ptr, - HYPRE_Int *B_ext_diag_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_diag_j, - HYPRE_Complex *B_ext_diag_data, - char *B_ext_diag_xata, /* companion with B_ext_diag_data_ptr; NULL if none */ - HYPRE_Int *B_ext_offd_nnz_ptr, - HYPRE_Int *B_ext_offd_ii, /* memory allocated outside */ - HYPRE_Int *B_ext_offd_j, - HYPRE_Complex *B_ext_offd_data, - char *B_ext_offd_xata /* companion with B_ext_offd_data_ptr; NULL if none */ ) 
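
The core routine is called twice, as the comment on its first parameter says: with job == 0 it only counts how many entries land in the diagonal block [first_col_diag_B, last_col_diag_B] so the caller can size the output arrays, and with job == 1 it performs the real partition-copy into memory allocated outside. A minimal host-side C++ sketch of that two-pass idiom, assuming a predicate that mirrors hypre's in_range functor (helper names here are illustrative, not hypre API):

    #include <algorithm>
    #include <vector>

    /* Pass 0 of the two-pass split: count the entries whose global column
     * falls inside the diagonal block, so exact-size outputs can be
     * allocated before pass 1 does the actual partition-copy. */
    struct InRange                 /* stand-in for hypre's in_range functor */
    {
        long long first, last;     /* long long stands in for HYPRE_BigInt */
        bool operator()(long long j) const { return j >= first && j <= last; }
    };

    long long count_diag_entries(const std::vector<long long> &bigj,
                                 long long first, long long last)
    {
        return std::count_if(bigj.begin(), bigj.end(), InRange{first, last});
    }
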
-{ - HYPRE_Int B_ext_diag_nnz; - HYPRE_Int B_ext_offd_nnz; - HYPRE_BigInt *B_ext_diag_bigj = NULL; - HYPRE_BigInt *B_ext_offd_bigj = NULL; - HYPRE_BigInt *col_map_offd_C; - HYPRE_Int *map_B_to_C = NULL; - HYPRE_Int num_cols_offd_C; - - in_range pred1(first_col_diag_B, last_col_diag_B); - - /* get diag and offd nnz */ - if (job == 0) - { - /* query the nnz's */ - B_ext_diag_nnz = HYPRE_THRUST_CALL( count_if, - B_ext_bigj, - B_ext_bigj + B_ext_nnz, - pred1 ); - B_ext_offd_nnz = B_ext_nnz - B_ext_diag_nnz; - - *B_ext_diag_nnz_ptr = B_ext_diag_nnz; - *B_ext_offd_nnz_ptr = B_ext_offd_nnz; - - return hypre_error_flag; - } - else - { - B_ext_diag_nnz = *B_ext_diag_nnz_ptr; - B_ext_offd_nnz = *B_ext_offd_nnz_ptr; - } - - /* copy to diag */ - B_ext_diag_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_diag_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, B_ext_diag_data, - B_ext_diag_xata)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_diag_ii, B_ext_diag_bigj, - B_ext_diag_data)), /* result */ - pred1 ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_diag_ii + B_ext_diag_nnz ); - } - - HYPRE_THRUST_CALL( transform, - B_ext_diag_bigj, - B_ext_diag_bigj + B_ext_diag_nnz, - thrust::make_constant_iterator(first_col_diag_B), - B_ext_diag_j, - thrust::minus()); - - hypre_TFree(B_ext_diag_bigj, HYPRE_MEMORY_DEVICE); - - /* copy to offd */ - B_ext_offd_bigj = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); - - if (B_ext_offd_xata) - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, B_ext_data, - B_ext_xata)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, B_ext_offd_data, - B_ext_offd_xata)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - else - { - auto new_end = HYPRE_THRUST_CALL( - copy_if, - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)), /* first */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_ii, B_ext_bigj, - B_ext_data)) + B_ext_nnz, /* last */ - B_ext_bigj, /* stencil */ - thrust::make_zip_iterator(thrust::make_tuple(B_ext_offd_ii, B_ext_offd_bigj, - B_ext_offd_data)), /* result */ - thrust::not1(pred1) ); - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == B_ext_offd_ii + B_ext_offd_nnz ); - } - - /* offd map of B_ext_offd Union col_map_offd_B */ - col_map_offd_C = hypre_TAlloc(HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, HYPRE_MEMORY_DEVICE); - 
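
The sequence that follows (Thrust sort/unique in the removed code, std::sort and std::unique via oneDPL in the added code earlier in this hunk) is the standard concatenate-sort-unique union: copy in the offd global column indices, append B's existing offd column map, sort, drop duplicates, and shrink to the surviving length. A host-side C++ sketch of the same idiom, with long long standing in for HYPRE_BigInt (illustrative helper, not hypre API):

    #include <algorithm>
    #include <vector>

    /* Concatenate-sort-unique union: after sort + unique the vector is the
     * ascending, duplicate-free union of the two inputs, and its length is
     * what the device code stores as num_cols_offd_C. */
    std::vector<long long> union_col_maps(const std::vector<long long> &offd_bigj,
                                          const std::vector<long long> &col_map_offd_B)
    {
        std::vector<long long> c(offd_bigj.size() + col_map_offd_B.size());
        std::copy(offd_bigj.begin(), offd_bigj.end(), c.begin());
        std::copy(col_map_offd_B.begin(), col_map_offd_B.end(),
                  c.begin() + offd_bigj.size());
        std::sort(c.begin(), c.end());
        c.erase(std::unique(c.begin(), c.end()), c.end()); /* shrink step */
        return c;
    }

Because the result is sorted and duplicate-free, the subsequent lower_bound calls can translate each global column index into its local position in col_map_offd_C in O(log n) per lookup.
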
hypre_TMemcpy(col_map_offd_C, B_ext_offd_bigj, HYPRE_BigInt, B_ext_offd_nnz, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(col_map_offd_C + B_ext_offd_nnz, col_map_offd_B, HYPRE_BigInt, num_cols_offd_B, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( sort, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - HYPRE_BigInt *new_end = HYPRE_THRUST_CALL( unique, - col_map_offd_C, - col_map_offd_C + B_ext_offd_nnz + num_cols_offd_B ); - - num_cols_offd_C = new_end - col_map_offd_C; - -#if 1 - HYPRE_BigInt *tmp = hypre_TAlloc(HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(tmp, col_map_offd_C, HYPRE_BigInt, num_cols_offd_C, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TFree(col_map_offd_C, HYPRE_MEMORY_DEVICE); - col_map_offd_C = tmp; -#else - col_map_offd_C = hypre_TReAlloc_v2(col_map_offd_C, HYPRE_BigInt, B_ext_offd_nnz + num_cols_offd_B, - HYPRE_Int, num_cols_offd_C, HYPRE_MEMORY_DEVICE); -#endif - - /* create map from col_map_offd_B */ - if (num_cols_offd_B) - { - map_B_to_C = hypre_TAlloc(HYPRE_Int, num_cols_offd_B, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - col_map_offd_B, - col_map_offd_B + num_cols_offd_B, - map_B_to_C ); - } - - HYPRE_THRUST_CALL( lower_bound, - col_map_offd_C, - col_map_offd_C + num_cols_offd_C, - B_ext_offd_bigj, - B_ext_offd_bigj + B_ext_offd_nnz, - B_ext_offd_j ); - - hypre_TFree(B_ext_offd_bigj, HYPRE_MEMORY_DEVICE); - - if (map_B_to_C_ptr) - { - *map_B_to_C_ptr = map_B_to_C; - } - *num_cols_offd_C_ptr = num_cols_offd_C; - *col_map_offd_C_ptr = col_map_offd_C; - - return hypre_error_flag; + HYPRE_Int num_rows = hypre_CSRMatrixNumRows(B_ext); + HYPRE_Int B_ext_nnz = hypre_CSRMatrixNumNonzeros(B_ext); + + HYPRE_Int *B_ext_ii = hypre_TAlloc(HYPRE_Int, B_ext_nnz, HYPRE_MEMORY_DEVICE); + hypreDevice_CsrRowPtrsToIndices_v2(num_rows, B_ext_nnz, hypre_CSRMatrixI(B_ext), B_ext_ii); + + HYPRE_Int B_ext_diag_nnz; + HYPRE_Int B_ext_offd_nnz; + HYPRE_Int ierr; + + ierr = hypre_CSRMatrixSplitDevice_core( 0, + num_rows, + B_ext_nnz, + NULL, + hypre_CSRMatrixBigJ(B_ext), + NULL, + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + NULL, + NULL, + NULL, + NULL, + &B_ext_diag_nnz, + NULL, + NULL, + NULL, + NULL, + &B_ext_offd_nnz, + NULL, + NULL, + NULL, + NULL ); + + HYPRE_Int *B_ext_diag_ii = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_diag_j = hypre_TAlloc(HYPRE_Int, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_diag_a = hypre_TAlloc(HYPRE_Complex, B_ext_diag_nnz, HYPRE_MEMORY_DEVICE); + + HYPRE_Int *B_ext_offd_ii = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Int *B_ext_offd_j = hypre_TAlloc(HYPRE_Int, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + HYPRE_Complex *B_ext_offd_a = hypre_TAlloc(HYPRE_Complex, B_ext_offd_nnz, HYPRE_MEMORY_DEVICE); + + ierr = hypre_CSRMatrixSplitDevice_core( 1, + num_rows, + B_ext_nnz, + B_ext_ii, + hypre_CSRMatrixBigJ(B_ext), + hypre_CSRMatrixData(B_ext), + NULL, + first_col_diag_B, + last_col_diag_B, + num_cols_offd_B, + col_map_offd_B, + map_B_to_C_ptr, + num_cols_offd_C_ptr, + col_map_offd_C_ptr, + &B_ext_diag_nnz, + B_ext_diag_ii, + B_ext_diag_j, + B_ext_diag_a, + NULL, + &B_ext_offd_nnz, + B_ext_offd_ii, + B_ext_offd_j, + B_ext_offd_a, + NULL ); + + hypre_TFree(B_ext_ii, HYPRE_MEMORY_DEVICE); + + /* convert to row ptrs */ + HYPRE_Int *B_ext_diag_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_diag_nnz, 
B_ext_diag_ii); + HYPRE_Int *B_ext_offd_i = hypreDevice_CsrRowIndicesToPtrs(num_rows, B_ext_offd_nnz, B_ext_offd_ii); + + hypre_TFree(B_ext_diag_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(B_ext_offd_ii, HYPRE_MEMORY_DEVICE); + + /* create diag and offd CSR */ + hypre_CSRMatrix *B_ext_diag = hypre_CSRMatrixCreate(num_rows, last_col_diag_B - first_col_diag_B + 1, B_ext_diag_nnz); + hypre_CSRMatrix *B_ext_offd = hypre_CSRMatrixCreate(num_rows, *num_cols_offd_C_ptr, B_ext_offd_nnz); + + hypre_CSRMatrixI(B_ext_diag) = B_ext_diag_i; + hypre_CSRMatrixJ(B_ext_diag) = B_ext_diag_j; + hypre_CSRMatrixData(B_ext_diag) = B_ext_diag_a; + hypre_CSRMatrixNumNonzeros(B_ext_diag) = B_ext_diag_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_diag) = HYPRE_MEMORY_DEVICE; + + hypre_CSRMatrixI(B_ext_offd) = B_ext_offd_i; + hypre_CSRMatrixJ(B_ext_offd) = B_ext_offd_j; + hypre_CSRMatrixData(B_ext_offd) = B_ext_offd_a; + hypre_CSRMatrixNumNonzeros(B_ext_offd) = B_ext_offd_nnz; + hypre_CSRMatrixMemoryLocation(B_ext_offd) = HYPRE_MEMORY_DEVICE; + + *B_ext_diag_ptr = B_ext_diag; + *B_ext_offd_ptr = B_ext_offd; + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return ierr; } /*-------------------------------------------------------------------------- @@ -541,149 +741,168 @@ hypre_CSRMatrixAddPartialDevice( hypre_CSRMatrix *A, hypre_CSRMatrix *B, HYPRE_Int *row_nums) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *B_data = hypre_CSRMatrixData(B); - HYPRE_Int *B_i = hypre_CSRMatrixI(B); - HYPRE_Int *B_j = hypre_CSRMatrixJ(B); - HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); - HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); - HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - HYPRE_Int nnzC; - hypre_CSRMatrix *C; - - if (ncols_A != ncols_B) - { - hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! incompatible matrix dimensions!\n"); - - return NULL; - } - - hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, - 1.0, B_data, NULL, row_nums, - &nnzC, &C_i, &C_j, &C_data); - - C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - hypre_SyncCudaComputeStream(hypre_handle()); - - return C; + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *B_data = hypre_CSRMatrixData(B); + HYPRE_Int *B_i = hypre_CSRMatrixI(B); + HYPRE_Int *B_j = hypre_CSRMatrixJ(B); + HYPRE_Int nrows_B = hypre_CSRMatrixNumRows(B); + HYPRE_Int ncols_B = hypre_CSRMatrixNumCols(B); + HYPRE_Int nnz_B = hypre_CSRMatrixNumNonzeros(B); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + HYPRE_Int nnzC; + hypre_CSRMatrix *C; + + if (ncols_A != ncols_B) + { + hypre_error_w_msg(HYPRE_ERROR_GENERIC, "Warning! 
Incompatible matrix dimensions!\n"); + + return NULL; + } + + hypreDevice_CSRSpAdd(nrows_A, nrows_B, ncols_A, nnz_A, nnz_B, A_i, A_j, 1.0, A_data, NULL, B_i, B_j, 1.0, B_data, NULL, row_nums, + &nnzC, &C_i, &C_j, &C_data); + + C = hypre_CSRMatrixCreate(nrows_A, ncols_B, nnzC); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + hypre_SyncDeviceComputeStream(hypre_handle()); + + return C; } HYPRE_Int hypre_CSRMatrixColNNzRealDevice( hypre_CSRMatrix *A, HYPRE_Real *colnnz) { - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_j_sorted; - HYPRE_Int num_reduced_col_indices; - HYPRE_Int *reduced_col_indices; - HYPRE_Int *reduced_col_nnz; - - A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - - reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); - reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_j_sorted; + HYPRE_Int num_reduced_col_indices; + HYPRE_Int *reduced_col_indices; + HYPRE_Int *reduced_col_nnz; + reduced_col_indices = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + reduced_col_nnz = hypre_TAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_DEVICE); + + A_j_sorted = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(A_j_sorted, A_j, HYPRE_Int, nnz_A, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL(std::sort, A_j_sorted, A_j_sorted + nnz_A); + + HYPRE_Int* values = hypre_TAlloc(HYPRE_Int, nnz_A, HYPRE_MEMORY_UNIFIED); + hypre_HandleComputeStream(hypre_handle())->fill(values, 1, nnz_A*sizeof(HYPRE_Int)).wait(); + std::pair new_end = + HYPRE_ONEDPL_CALL( oneapi::dpl::reduce_by_segment, A_j_sorted, A_j_sorted + nnz_A, + values, + reduced_col_indices, + reduced_col_nnz ); + + hypre_TFree(values, HYPRE_MEMORY_UNIFIED); +#else + HYPRE_THRUST_CALL(sort, A_j_sorted, A_j_sorted + nnz_A); - thrust::pair new_end = - HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, - thrust::make_constant_iterator(1), - reduced_col_indices, - reduced_col_nnz); + thrust::pair new_end = + HYPRE_THRUST_CALL(reduce_by_key, A_j_sorted, A_j_sorted + nnz_A, + thrust::make_constant_iterator(1), + reduced_col_indices, + reduced_col_nnz); +#endif - hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); + hypre_assert(new_end.first - reduced_col_indices == new_end.second - reduced_col_nnz); - num_reduced_col_indices = new_end.first - reduced_col_indices; + num_reduced_col_indices = new_end.first - reduced_col_indices; - hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, - reduced_col_indices, colnnz); + hypre_Memset(colnnz, 0, ncols_A * sizeof(HYPRE_Real), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( oneapi::dpl::copy, reduced_col_nnz, reduced_col_nnz + num_reduced_col_indices, + oneapi::dpl::make_permutation_iterator(colnnz, reduced_col_indices) ); +#else + HYPRE_THRUST_CALL(scatter, reduced_col_nnz, reduced_col_nnz + 
num_reduced_col_indices, + reduced_col_indices, colnnz); +#endif - hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); - hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j_sorted, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_indices, HYPRE_MEMORY_DEVICE); + hypre_TFree(reduced_col_nnz, HYPRE_MEMORY_DEVICE); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } __global__ void -hypreCUDAKernel_CSRMoveDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa ) +hypreGPUKernel_CSRMoveDiagFirst( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa ) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; - - if (find_diag) - { - ja[j] = ja[p]; - ja[p] = row; - HYPRE_Complex tmp = aa[p]; - aa[p] = aa[j]; - aa[j] = tmp; - } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - break; - } - } -} +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif -HYPRE_Int -hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) -{ - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + if (row >= nrows) + { + return; + } - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + HYPRE_Int p = 0, q = 0; - HYPRE_CUDA_LAUNCH(hypreCUDAKernel_CSRMoveDiagFirst, gDim, bDim, - nrows, A_i, A_j, A_data); + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + for (HYPRE_Int j = p + lane + 1; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return hypre_error_flag; + for (HYPRE_Int j = p + lane + 1; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && ja[j] == row; + + if (find_diag) + { + ja[j] = ja[p]; + ja[p] = row; + HYPRE_Complex tmp = aa[p]; + aa[p] = aa[j]; + aa[j] = tmp; + } + +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + break; + } + } } /* check if diagonal entry is the first one at each row @@ -691,503 +910,729 @@ hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) * RL: only check if it's a non-empty row */ __global__ void -hypreCUDAKernel_CSRCheckDiagFirst( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Int *result ) +hypreGPUKernel_CSRCheckDiagFirst( +#ifdef 
HYPRE_USING_SYCL + sycl::nd_item<1>& item, +#endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Int *result ) { - const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1, 1>(); - if (row < nrows) - { - result[row] = (ia[row + 1] > ia[row]) && (ja[ia[row]] != row); - } +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + const HYPRE_Int row = hypre_cuda_get_grid_thread_id<1,1>(); +#endif + if (row < nrows) + { + result[row] = (ia[row+1] > ia[row]) && (ja[ia[row]] != row); + } } -HYPRE_Int -hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) +__global__ void +hypreGPUKernel_CSRMatrixFixZeroDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) { - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return 0; - } +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + + if (row >= nrows) + { + return; + } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); + HYPRE_Int p = 0, q = 0; + bool has_diag = false; - HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRCheckDiagFirst, gDim, bDim, - hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } - HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - hypre_TFree(result, HYPRE_MEMORY_DEVICE); + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - hypre_SyncCudaComputeStream(hypre_handle()); + if (find_diag) + { + if (fabs(data[j]) <= tol) + { + data[j] = v; + } + } - return ierr; +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } __global__ void -hypreCUDAKernel_CSRMatrixFixZeroDiagDevice( HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypreGPUKernel_CSRMatrixReplaceDiagDevice( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *data, + HYPRE_Real tol, + HYPRE_Int *result ) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row >= nrows) - { - return; - } - - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; - - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - 
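/* Lanes 0 and 1 of the warp loaded ia[row] and ia[row + 1] just above;
 * the shuffle above broadcasts lane 1's value into q (one past the row's
 * last entry) and the shuffle below broadcasts lane 0's value into p
 * (the row's first entry), so every lane sees the same [p, q) range and
 * the warp can scan the row cooperatively in warp-size strides. */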
p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; +#ifdef HYPRE_USING_SYCL + const HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } + + HYPRE_Int p = 0, q = 0; + bool has_diag = false; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (find_diag) - { - if (fabs(data[j]) <= tol) - { - data[j] = v; - } - } + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + if (find_diag) + { + HYPRE_Complex d = read_only_load(&new_diag[row]); + if (fabs(d) <= tol) { - has_diag = true; - break; + d = v; } - } + data[j] = d; + } - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = true; + break; + } + } + + if (result && !has_diag && lane == 0) + { + result[row] = 1; + } } -/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v - * Does NOT assume diagonal is the first entry of each row of A - * In debug mode: - * Returns the number of rows that do not have diag in the pattern - * (i.e., structural zeroes on the diagonal) +/* type == 0, sum, + * 1, abs sum (l-1) + * 2, square sum (l-2) */ -HYPRE_Int -hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex v, - HYPRE_Real tol ) +template +__global__ void +hypreGPUKernel_CSRRowSum( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Complex scal, + HYPRE_Int set) { - HYPRE_Int ierr = 0; - - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - -#if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row_i = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); #else - HYPRE_Int *result = NULL; + HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); #endif + if (row_i >= nrows) + { + return; + } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, - v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + HYPRE_Int p = 0, q = 0; -#if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); + if (lane < 2) 
+ { + p = read_only_load(ia + row_i + lane); + } - hypre_TFree(result, HYPRE_MEMORY_DEVICE); -#endif + HYPRE_Complex row_sum_i = 0.0; + +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); - hypre_SyncCudaComputeStream(hypre_handle()); + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - return ierr; + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) + { + continue; + } + + HYPRE_Complex aii = aa[j]; + + if (type == 0) + { + row_sum_i += aii; + } + else if (type == 1) + { + row_sum_i += fabs(aii); + } + else if (type == 2) + { + row_sum_i += aii * aii; + } + } +#ifdef HYPRE_USING_SYCL + row_sum_i = warp_reduce_sum(row_sum_i, item); +#else + row_sum_i = warp_reduce_sum(row_sum_i); +#endif + if (lane == 0) + { + if (set) + { + row_sum[row_i] = scal * row_sum_i; + } + else + { + row_sum[row_i] += scal * row_sum_i; + } + } } +/* type 0: diag + * 1: abs diag + * 2: diag inverse + * 3: diag inverse sqrt + * 4: abs diag inverse sqrt + */ __global__ void -hypreCUDAKernel_CSRMatrixReplaceDiagDevice( HYPRE_Complex *new_diag, - HYPRE_Complex v, - HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *data, - HYPRE_Real tol, - HYPRE_Int *result ) +hypreGPUKernel_CSRExtractDiag( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int nrows, + HYPRE_Int *ia, + HYPRE_Int *ja, + HYPRE_Complex *aa, + HYPRE_Complex *d, + HYPRE_Int type) { - const HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); +#ifdef HYPRE_USING_SYCL + HYPRE_Int row = hypre_gpu_get_grid_warp_id<1,1>(item); + HYPRE_Int lane = hypre_gpu_get_lane_id<1>(item); + sycl::sub_group SG = item.get_sub_group(); +#else + HYPRE_Int row = hypre_cuda_get_grid_warp_id<1,1>(); + HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); +#endif + if (row >= nrows) + { + return; + } + + HYPRE_Int p = 0, q = 0; + + if (lane < 2) + { + p = read_only_load(ia + row + lane); + } + HYPRE_Int has_diag = 0; +#ifdef HYPRE_USING_SYCL + q = SG.shuffle(p, 1); + p = SG.shuffle(p, 0); + + for (HYPRE_Int j = p + lane; sycl::any_of_group(SG, j < q); j += SG.get_local_range().get(0)) +#else + q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); + p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); - if (row >= nrows) - { - return; - } + for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) +#endif + { + hypre_int find_diag = j < q && ja[j] == row; - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; - bool has_diag = false; + if (find_diag) + { + if (type == 0) + { + d[row] = aa[j]; + } + else if (type == 1) + { + d[row] = fabs(aa[j]); + } + else if (type == 2) + { + d[row] = 1.0 / aa[j]; + } + else if (type == 3) + { + d[row] = 1.0 / sqrt(aa[j]); + } + else if (type == 4) + { + d[row] = 1.0 / sqrt(fabs(aa[j])); + } + } - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#ifdef HYPRE_USING_SYCL + if ( sycl::any_of_group(SG, find_diag) ) +#else + if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) +#endif + { + has_diag = 1; + break; + } + } + + if (!has_diag && lane == 0) + { + d[row] = 0.0; + } +} - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - 
{ - hypre_int find_diag = j < q && read_only_load(&ja[j]) == row; +/* mark is of size nA + * diag_option: 1: special treatment for diag entries, mark as -2 + */ +__global__ void +hypreGPUKernel_CSRMatrixIntersectPattern( + #ifdef HYPRE_USING_SYCL + sycl::nd_item<1>& item, + #endif + HYPRE_Int n, + HYPRE_Int nA, + HYPRE_Int *rowid, + HYPRE_Int *colid, + HYPRE_Int *idx, + HYPRE_Int *mark, + HYPRE_Int diag_option) +{ +#ifdef HYPRE_USING_SYCL + HYPRE_Int i = hypre_gpu_get_grid_thread_id<1,1>(item); +#else + HYPRE_Int i = hypre_cuda_get_grid_thread_id<1,1>(); +#endif - if (find_diag) + if (i >= n) + { + return; + } + + HYPRE_Int r1 = read_only_load(&rowid[i]); + HYPRE_Int c1 = read_only_load(&colid[i]); + HYPRE_Int j = read_only_load(&idx[i]); + + if (0 == diag_option) + { + if (j < nA) + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) { - HYPRE_Complex d = read_only_load(&new_diag[row]); - if (fabs(d) <= tol) - { - d = v; - } - data[j] = d; + mark[j] = c1; } - - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) + else { - has_diag = true; - break; + mark[j] = -1; } - } - - if (result && !has_diag && lane == 0) - { - result[row] = 1; - } + } + } + else if (1 == diag_option) + { + if (j < nA) + { + if (r1 == c1) + { + mark[j] = -2; + } + else + { + HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; + HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; + if (r1 == r2 && c1 == c2) + { + mark[j] = c1; + } + else + { + mark[j] = -1; + } + } + } + } } +/* For square A, find numerical zeros (absolute values <= tol) on its diagonal and replace with v + * Does NOT assume diagonal is the first entry of each row of A + * In debug mode: + * Returns the number of rows that do not have diag in the pattern + * (i.e., structural zeroes on the diagonal) + */ HYPRE_Int -hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, - HYPRE_Complex *new_diag, +hypre_CSRMatrixFixZeroDiagDevice( hypre_CSRMatrix *A, HYPRE_Complex v, HYPRE_Real tol ) { - HYPRE_Int ierr = 0; + HYPRE_Int ierr = 0; - if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) - { - return ierr; - } + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); #if HYPRE_DEBUG - HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); #else - HYPRE_Int *result = NULL; + HYPRE_Int *result = NULL; #endif - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, - new_diag, v, hypre_CSRMatrixNumRows(A), - hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), - tol, result ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixFixZeroDiagDevice, gDim, bDim, + v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); #if HYPRE_DEBUG - ierr = HYPRE_THRUST_CALL( reduce, - result, - result + hypre_CSRMatrixNumRows(A) ); - - hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif 
defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); #endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return ierr; + return ierr; } -typedef thrust::tuple Int2; -struct Int2Unequal : public thrust::unary_function -{ - __host__ __device__ - bool operator()(const Int2& t) const - { - return (thrust::get<0>(t) != thrust::get<1>(t)); - } -}; - HYPRE_Int -hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +hypre_CSRMatrixReplaceDiagDevice( hypre_CSRMatrix *A, + HYPRE_Complex *new_diag, + HYPRE_Complex v, + HYPRE_Real tol ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - HYPRE_Int new_nnz; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - Int2Unequal() ); - - if (new_nnz == nnz) - { - /* no diagonal entries found */ - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } + HYPRE_Int ierr = 0; - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return ierr; + } - if (A_data) - { - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "warp", bDim); - thrust::zip_iterator< thrust::tuple > new_end; +#if HYPRE_DEBUG + HYPRE_Int *result = hypre_CTAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); +#else + HYPRE_Int *result = NULL; +#endif - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - Int2Unequal() ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixReplaceDiagDevice, gDim, bDim, + new_diag, v, hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), hypre_CSRMatrixData(A), + tol, result ); - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } - else - { - new_data = NULL; +#if HYPRE_DEBUG +#if defined(HYPRE_USING_CUDA) + ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif + hypre_TFree(result, HYPRE_MEMORY_DEVICE); +#endif // HYPRE_DEBUG - thrust::zip_iterator< thrust::tuple > new_end; + hypre_SyncDeviceComputeStream(hypre_handle()); - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), - Int2Unequal() ); + return ierr; +} - hypre_assert( 
thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); - } +HYPRE_Int +hypre_CSRMatrixRemoveDiagonalDevice(hypre_CSRMatrix *A) +{ + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + HYPRE_Int new_nnz; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + +#ifdef HYPRE_USING_SYCL + auto zipped_begin = oneapi::dpl::make_zip_iterator(A_ii, A_j); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + zipped_begin, zipped_begin + nnz, + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + Int2Unequal() ); +#endif + + if (new_nnz == nnz) + { + /* no diagonal entries found */ + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + + if (A_data) + { + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_ii, A_j), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // todo: fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } + else + { + new_data = NULL; +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j); + auto new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + first, + oneapi::dpl::make_zip_iterator(new_ii, new_j), + [](auto t) { return std::get<0>(t) != std::get<1>(t); } ); + // TODO: abb fix this + // hypre_assert( std::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple > new_end; + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j)), + Int2Unequal() ); + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif + } - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - 
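/* new_ii holds one COO-style row index per surviving entry; turning it
 * back into CSR row pointers here is the inverse of the
 * hypreDevice_CsrRowPtrsToIndices expansion performed before the copy_if
 * compaction, after which new_ii itself can be freed. */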
hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -/* type == 0, sum, - * 1, abs sum (l-1) - * 2, square sum (l-2) - */ -template -__global__ void -hypreCUDAKernel_CSRRowSum( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Complex scal, - HYPRE_Int set) +HYPRE_Int +hypre_CSRMatrixCheckDiagFirstDevice( hypre_CSRMatrix *A ) { - HYPRE_Int row_i = hypre_cuda_get_grid_warp_id<1, 1>(); - - if (row_i >= nrows) - { - return; - } + if (hypre_CSRMatrixNumRows(A) != hypre_CSRMatrixNumCols(A)) + { + return 0; + } - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(hypre_CSRMatrixNumRows(A), "thread", bDim); - if (lane < 2) - { - p = read_only_load(ia + row_i + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); + HYPRE_Int *result = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(A), HYPRE_MEMORY_DEVICE); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRCheckDiagFirst, gDim, bDim, + hypre_CSRMatrixNumRows(A), + hypre_CSRMatrixI(A), hypre_CSRMatrixJ(A), result ); - HYPRE_Complex row_sum_i = 0.0; +#if defined(HYPRE_USING_CUDA) + HYPRE_Int ierr = HYPRE_THRUST_CALL( reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_Int ierr = HYPRE_ONEDPL_CALL( oneapi::dpl::reduce, + result, + result + hypre_CSRMatrixNumRows(A) ); +#endif - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - if ( j >= q || (CF_i && CF_j && read_only_load(&CF_i[row_i]) != read_only_load(&CF_j[ja[j]])) ) - { - continue; - } + hypre_TFree(result, HYPRE_MEMORY_DEVICE); - HYPRE_Complex aii = aa[j]; + hypre_SyncDeviceComputeStream(hypre_handle()); - if (type == 0) - { - row_sum_i += aii; - } - else if (type == 1) - { - row_sum_i += fabs(aii); - } - else if (type == 2) - { - row_sum_i += aii * aii; - } - } - - row_sum_i = warp_reduce_sum(row_sum_i); - - if (lane == 0) - { - if (set) - { - row_sum[row_i] = scal * row_sum_i; - } - else - { - row_sum[row_i] += scal * row_sum_i; - } - } + return ierr; } -void -hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, - HYPRE_Int *CF_i, - HYPRE_Int *CF_j, - HYPRE_Complex *row_sum, - HYPRE_Int type, - HYPRE_Complex scal, - const char *set_or_add) +HYPRE_Int +hypre_CSRMatrixMoveDiagFirstDevice( hypre_CSRMatrix *A ) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - if (type == 0) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<0>, gDim, bDim, 
nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 1) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } - else if (type == 2) - { - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, - row_sum, scal, set_or_add[0] == 's' ); - } + HYPRE_GPU_LAUNCH(hypreGPUKernel_CSRMoveDiagFirst, gDim, bDim, + nrows, A_i, A_j, A_data); + + hypre_SyncDeviceComputeStream(hypre_handle()); - hypre_SyncCudaComputeStream(hypre_handle()); + return hypre_error_flag; } -/* type 0: diag - * 1: abs diag - * 2: diag inverse - * 3: diag inverse sqrt - * 4: abs diag inverse sqrt - */ -__global__ void -hypreCUDAKernel_CSRExtractDiag( HYPRE_Int nrows, - HYPRE_Int *ia, - HYPRE_Int *ja, - HYPRE_Complex *aa, - HYPRE_Complex *d, - HYPRE_Int type) +/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J + * Otherwise, mark pattern not in A-B as -1 in markA + * Note the special treatment for diagonal entries of A (marked as -2) */ +HYPRE_Int +hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, + hypre_CSRMatrix *B, + HYPRE_Int *markA, + HYPRE_Int diag_opt) { - HYPRE_Int row = hypre_cuda_get_grid_warp_id<1, 1>(); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - if (row >= nrows) - { - return; - } + HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); + HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int lane = hypre_cuda_get_lane_id<1>(); - HYPRE_Int p = 0, q = 0; + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); + hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - if (lane < 2) - { - p = read_only_load(ia + row + lane); - } - q = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 1); - p = __shfl_sync(HYPRE_WARP_FULL_MASK, p, 0); +#if defined(HYPRE_USING_CUDA) + HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); + + HYPRE_THRUST_CALL( stable_sort_by_key, + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), + thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, + idx ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_ONEDPL_CALL( dpct::iota, idx, idx + nnzA + nnzB, 0 ); + + auto zipped_begin = oneapi::dpl::make_zip_iterator(Cii, Cjj, idx); + HYPRE_ONEDPL_CALL( std::stable_sort, zipped_begin, zipped_begin + nnzA + nnzB, + [](auto lhs, auto rhs) { return std::get<0>(lhs) < std::get<0>(rhs); } ); +#endif - HYPRE_Int has_diag = 0; + hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - for (HYPRE_Int j = p + lane; __any_sync(HYPRE_WARP_FULL_MASK, j < q); j += HYPRE_WARP_SIZE) - { - hypre_int find_diag = j < q && ja[j] == row; + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - if (find_diag) - { - if (type == 0) - { - d[row] = aa[j]; - } - else if (type == 1) - { - d[row] = fabs(aa[j]); - } - else if (type == 2) - { - d[row] = 1.0 / aa[j]; - } - else if (type == 3) 
- { - d[row] = 1.0 / sqrt(aa[j]); - } - else if (type == 4) - { - d[row] = 1.0 / sqrt(fabs(aa[j])); - } - } + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRMatrixIntersectPattern, gDim, bDim, + nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - if ( __any_sync(HYPRE_WARP_FULL_MASK, find_diag) ) - { - has_diag = 1; - break; - } - } + hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); + hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); + hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - if (!has_diag && lane == 0) - { - d[row] = 0.0; - } + return hypre_error_flag; } void @@ -1195,109 +1640,119 @@ hypre_CSRMatrixExtractDiagonalDevice( hypre_CSRMatrix *A, HYPRE_Complex *d, HYPRE_Int type) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - dim3 bDim, gDim; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - bDim = hypre_GetDefaultDeviceBlockDimension(); - gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRExtractDiag, gDim, bDim, nrows, A_i, A_j, A_data, d, type ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); } /* return C = [A; B] */ hypre_CSRMatrix* hypre_CSRMatrixStack2Device(hypre_CSRMatrix *A, hypre_CSRMatrix *B) { - hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); - - hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), - hypre_CSRMatrixNumCols(A), - hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); - - HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, - HYPRE_MEMORY_DEVICE); - HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), - HYPRE_MEMORY_DEVICE); - HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), - HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, - hypre_CSRMatrixNumRows(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( transform, - C_i + hypre_CSRMatrixNumRows(A) + 1, - C_i + hypre_CSRMatrixNumRows(C) + 1, - thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), - C_i + hypre_CSRMatrixNumRows(A) + 1, - thrust::plus() ); - - hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, - hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, - hypre_CSRMatrixNumNonzeros(B), - HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); - - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_a; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - - return C; + 
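
Stacking C = [A; B] only needs B's row pointers shifted by nnz(A), since B's rows begin after all of A's entries in the concatenated column-index and data arrays; that is what the transform over C_i in this hunk computes. A small host-side C++ sketch of the pointer arithmetic (hypothetical helper, not hypre API):

    #include <cstddef>
    #include <vector>

    /* Build the row-pointer array of C = [A; B]: copy A's pointers
     * verbatim, then append B's pointers (skipping B's leading zero)
     * offset by nnz(A). a_i has nrows_A + 1 entries ending in nnz(A). */
    std::vector<int> stack_row_ptrs(const std::vector<int> &a_i,
                                    const std::vector<int> &b_i)
    {
        const int nnz_a = a_i.back();
        std::vector<int> c_i(a_i);              /* rows of A, unchanged */
        for (std::size_t r = 1; r < b_i.size(); ++r)
        {
            c_i.push_back(b_i[r] + nnz_a);      /* rows of B, offset    */
        }
        return c_i;
    }

For example, a_i = {0, 2, 5} and b_i = {0, 1, 3} yield c_i = {0, 2, 5, 6, 8}, matching two rows of A followed by two rows of B in one CSR structure.
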
hypre_assert( hypre_CSRMatrixNumCols(A) == hypre_CSRMatrixNumCols(B) ); + + hypre_CSRMatrix *C = hypre_CSRMatrixCreate( hypre_CSRMatrixNumRows(A) + hypre_CSRMatrixNumRows(B), + hypre_CSRMatrixNumCols(A), + hypre_CSRMatrixNumNonzeros(A) + hypre_CSRMatrixNumNonzeros(B) ); + + HYPRE_Int *C_i = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + HYPRE_Int *C_j = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + HYPRE_Complex *C_a = hypre_TAlloc(HYPRE_Complex, hypre_CSRMatrixNumNonzeros(C), HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_i, hypre_CSRMatrixI(A), HYPRE_Int, hypre_CSRMatrixNumRows(A) + 1, + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_i + hypre_CSRMatrixNumRows(A) + 1, hypre_CSRMatrixI(B) + 1, HYPRE_Int, hypre_CSRMatrixNumRows(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + HYPRE_Int *const_iterator = hypre_TAlloc(HYPRE_Int, hypre_CSRMatrixNumRows(C) + 1, HYPRE_MEMORY_DEVICE); + hypre_HandleComputeStream(hypre_handle())->fill(const_iterator, hypre_CSRMatrixNumNonzeros(A), (hypre_CSRMatrixNumRows(C) + 1)*sizeof(HYPRE_Int)).wait(); + + HYPRE_ONEDPL_CALL( std::transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + const_iterator, //dpct::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + std::plus() ); + + hypre_TFree(const_iterator, HYPRE_MEMORY_DEVICE); +#else + HYPRE_THRUST_CALL( transform, + C_i + hypre_CSRMatrixNumRows(A) + 1, + C_i + hypre_CSRMatrixNumRows(C) + 1, + thrust::make_constant_iterator(hypre_CSRMatrixNumNonzeros(A)), + C_i + hypre_CSRMatrixNumRows(A) + 1, + thrust::plus() ); +#endif + + hypre_TMemcpy(C_j, hypre_CSRMatrixJ(A), HYPRE_Int, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_j + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixJ(B), HYPRE_Int, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_TMemcpy(C_a, hypre_CSRMatrixData(A), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(A), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + hypre_TMemcpy(C_a + hypre_CSRMatrixNumNonzeros(A), hypre_CSRMatrixData(B), HYPRE_Complex, hypre_CSRMatrixNumNonzeros(B), + HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_DEVICE); + + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_a; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + + return C; } /* A = alp * I */ hypre_CSRMatrix * hypre_CSRMatrixIdentityDevice(HYPRE_Int n, HYPRE_Complex alp) { - hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - - hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrix *A = hypre_CSRMatrixCreate(n, n, n); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixI(A), - hypre_CSRMatrixI(A) + n + 1, - 0 ); + hypre_CSRMatrixInitialize_v2(A, 0, HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, - hypre_CSRMatrixJ(A), - hypre_CSRMatrixJ(A) + n, - 0 ); +#ifdef HYPRE_USING_SYCL + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); - HYPRE_THRUST_CALL( fill, - hypre_CSRMatrixData(A), - hypre_CSRMatrixData(A) + n, - alp ); + HYPRE_ONEDPL_CALL( dpct::iota, + hypre_CSRMatrixJ(A), + hypre_CSRMatrixJ(A) + n, + 0 ); - return A; + HYPRE_ONEDPL_CALL( std::fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#else + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixI(A), + hypre_CSRMatrixI(A) + n + 1, + 0 ); + + HYPRE_THRUST_CALL( sequence, + hypre_CSRMatrixJ(A), + 
hypre_CSRMatrixJ(A) + n, + 0 ); + + HYPRE_THRUST_CALL( fill, + hypre_CSRMatrixData(A), + hypre_CSRMatrixData(A) + n, + alp ); +#endif + return A; } -/* this predicate compares first and second element in a tuple in absolute value */ -/* first is assumed to be complex, second to be real > 0 */ -struct cabsfirst_greaterthan_second_pred : public - thrust::unary_function, bool> -{ - __host__ __device__ - bool operator()(const thrust::tuple& t) const - { - const HYPRE_Complex i = thrust::get<0>(t); - const HYPRE_Real j = thrust::get<1>(t); - - return hypre_cabs(i) > j; - } -}; /* drop the entries that are smaller than: * tol if elmt_tols == null, @@ -1307,248 +1762,210 @@ hypre_CSRMatrixDropSmallEntriesDevice( hypre_CSRMatrix *A, HYPRE_Real tol, HYPRE_Real *elmt_tols) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_ii = NULL; - HYPRE_Int new_nnz = 0; - HYPRE_Int *new_ii; - HYPRE_Int *new_j; - HYPRE_Complex *new_data; - - if (elmt_tols == NULL) - { - new_nnz = HYPRE_THRUST_CALL( count_if, - A_data, - A_data + nnz, - thrust::not1(less_than(tol)) ); - } - else - { - new_nnz = HYPRE_THRUST_CALL( count_if, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)) + nnz, - cabsfirst_greaterthan_second_pred() ); - } - - if (new_nnz == nnz) - { - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; - } - - if (!A_ii) - { - A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); - } - new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); - new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); - - thrust::zip_iterator< thrust::tuple > new_end; - - if (elmt_tols == NULL) - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - A_data, - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - thrust::not1(less_than(tol)) ); - } - else - { - new_end = HYPRE_THRUST_CALL( copy_if, - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), - thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, - thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), - thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), - cabsfirst_greaterthan_second_pred() ); - } - - hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Int nnz = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_ii = NULL; + HYPRE_Int new_nnz = 0; + HYPRE_Int *new_ii; + HYPRE_Int *new_j; + HYPRE_Complex *new_data; + + if (elmt_tols == NULL) + { +#ifdef HYPRE_USING_SYCL + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + A_data, + A_data + nnz, + std::not_fn(less_than(tol)) ); +#else + new_nnz = HYPRE_THRUST_CALL( count_if, + A_data, + A_data + nnz, + thrust::not1(less_than(tol)) ); +#endif + } + else + { +#ifdef HYPRE_USING_SYCL + auto first = oneapi::dpl::make_zip_iterator(A_data, elmt_tols); + new_nnz = HYPRE_ONEDPL_CALL( std::count_if, + first, + first + nnz, + 
+ + if (new_nnz == nnz) + { + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + return hypre_error_flag; + } + + if (!A_ii) + { + A_ii = hypreDevice_CsrRowPtrsToIndices(nrows, nnz, A_i); + } + new_ii = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_j = hypre_TAlloc(HYPRE_Int, new_nnz, HYPRE_MEMORY_DEVICE); + new_data = hypre_TAlloc(HYPRE_Complex, new_nnz, HYPRE_MEMORY_DEVICE); + +#ifdef HYPRE_USING_SYCL + oneapi::dpl::zip_iterator< HYPRE_Int*, HYPRE_Int*, HYPRE_Complex* > new_end; + auto first = oneapi::dpl::make_zip_iterator(A_ii, A_j, A_data); + + if (elmt_tols == NULL) + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + A_data, + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + std::not_fn(less_than<HYPRE_Complex>(tol)) ); + } + else + { + new_end = HYPRE_ONEDPL_CALL( dpct::copy_if, + first, first + nnz, + oneapi::dpl::make_zip_iterator(A_data, elmt_tols), + oneapi::dpl::make_zip_iterator(new_ii, new_j, new_data), + cabsfirst_greaterthan_second_pred() ); + } + + // TODO: abb fix this + // hypre_assert( thrust::get<0>(*new_end) == new_ii + new_nnz ); +#else + thrust::zip_iterator< thrust::tuple<HYPRE_Int*, HYPRE_Int*, HYPRE_Complex*> > new_end; + + if (elmt_tols == NULL) + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + A_data, + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + thrust::not1(less_than<HYPRE_Complex>(tol)) ); + } + else + { + new_end = HYPRE_THRUST_CALL( copy_if, + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)), + thrust::make_zip_iterator(thrust::make_tuple(A_ii, A_j, A_data)) + nnz, + thrust::make_zip_iterator(thrust::make_tuple(A_data, elmt_tols)), + thrust::make_zip_iterator(thrust::make_tuple(new_ii, new_j, new_data)), + cabsfirst_greaterthan_second_pred() ); + } + + hypre_assert( thrust::get<0>(new_end.get_iterator_tuple()) == new_ii + new_nnz ); +#endif - hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); - hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_ii, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_i, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_j, HYPRE_MEMORY_DEVICE); + hypre_TFree(A_data, HYPRE_MEMORY_DEVICE); - hypre_CSRMatrixNumNonzeros(A) = new_nnz; - hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); - hypre_CSRMatrixJ(A) = new_j; - hypre_CSRMatrixData(A) = new_data; - hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); + hypre_CSRMatrixNumNonzeros(A) = new_nnz; + hypre_CSRMatrixI(A) = hypreDevice_CsrRowIndicesToPtrs(nrows, new_nnz, new_ii); + hypre_CSRMatrixJ(A) = new_j; + hypre_CSRMatrixData(A) = new_data; + hypre_TFree(new_ii, HYPRE_MEMORY_DEVICE); - return hypre_error_flag; + return hypre_error_flag; } -/* mark is of size nA - * diag_option: 1: special treatment for diag entries, mark as -2 - */ -__global__ void -hypreCUDAKernel_CSRMatrixIntersectPattern(HYPRE_Int n, - HYPRE_Int nA, - HYPRE_Int *rowid, - HYPRE_Int *colid, - HYPRE_Int *idx, - HYPRE_Int *mark, - HYPRE_Int diag_option) -{ - HYPRE_Int i = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (i >= n) - { - return; - } - - HYPRE_Int r1 = read_only_load(&rowid[i]); - HYPRE_Int c1 = 
read_only_load(&colid[i]); - HYPRE_Int j = read_only_load(&idx[i]); - - if (0 == diag_option) - { - if (j < nA) - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - else if (1 == diag_option) - { - if (j < nA) - { - if (r1 == c1) - { - mark[j] = -2; - } - else - { - HYPRE_Int r2 = i < n - 1 ? read_only_load(&rowid[i + 1]) : -1; - HYPRE_Int c2 = i < n - 1 ? read_only_load(&colid[i + 1]) : -1; - if (r1 == r2 && c1 == c2) - { - mark[j] = c1; - } - else - { - mark[j] = -1; - } - } - } - } -} - -/* markA: array of size nnz(A), for pattern of (A and B), markA is the column indices as in A_J - * Otherwise, mark pattern not in A-B as -1 in markA - * Note the special treatment for diagonal entries of A (marked as -2) */ -HYPRE_Int -hypre_CSRMatrixIntersectPattern(hypre_CSRMatrix *A, - hypre_CSRMatrix *B, - HYPRE_Int *markA, - HYPRE_Int diag_opt) +void +hypre_CSRMatrixComputeRowSumDevice( hypre_CSRMatrix *A, + HYPRE_Int *CF_i, + HYPRE_Int *CF_j, + HYPRE_Complex *row_sum, + HYPRE_Int type, + HYPRE_Complex scal, + const char *set_or_add) { - HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); - HYPRE_Int nnzA = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Int nnzB = hypre_CSRMatrixNumNonzeros(B); - - HYPRE_Int *Cii = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *Cjj = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - HYPRE_Int *idx = hypre_TAlloc(HYPRE_Int, nnzA + nnzB, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzA, hypre_CSRMatrixI(A), Cii); - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnzB, hypre_CSRMatrixI(B), Cii + nnzA); - hypre_TMemcpy(Cjj, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - hypre_TMemcpy(Cjj + nnzA, hypre_CSRMatrixJ(B), HYPRE_Int, nnzB, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - HYPRE_THRUST_CALL( sequence, idx, idx + nnzA + nnzB ); - - HYPRE_THRUST_CALL( stable_sort_by_key, - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)), - thrust::make_zip_iterator(thrust::make_tuple(Cii, Cjj)) + nnzA + nnzB, - idx ); - - hypre_TMemcpy(markA, hypre_CSRMatrixJ(A), HYPRE_Int, nnzA, HYPRE_MEMORY_DEVICE, - HYPRE_MEMORY_DEVICE); - - dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); - dim3 gDim = hypre_GetDefaultDeviceGridDimension(nnzA + nnzB, "thread", bDim); - - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CSRMatrixIntersectPattern, gDim, bDim, - nnzA + nnzB, nnzA, Cii, Cjj, idx, markA, diag_opt ); - - hypre_TFree(Cii, HYPRE_MEMORY_DEVICE); - hypre_TFree(Cjj, HYPRE_MEMORY_DEVICE); - hypre_TFree(idx, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; + HYPRE_Int nrows = hypre_CSRMatrixNumRows(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + + dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); + dim3 gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim); + + if (type == 0) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<0>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 1) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<1>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' ); + } + else if (type == 2) + { + HYPRE_GPU_LAUNCH( hypreGPUKernel_CSRRowSum<2>, gDim, bDim, nrows, A_i, A_j, A_data, CF_i, CF_j, + row_sum, scal, set_or_add[0] == 's' 
); + } + + hypre_SyncDeviceComputeStream(hypre_handle()); } -#endif /* HYPRE_USING_CUDA || defined(HYPRE_USING_HIP) */ - -#if defined(HYPRE_USING_GPU) - HYPRE_Int hypre_CSRMatrixTransposeDevice(hypre_CSRMatrix *A, hypre_CSRMatrix **AT_ptr, HYPRE_Int data) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); - HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); - HYPRE_Complex *C_data; - HYPRE_Int *C_i; - HYPRE_Int *C_j; - hypre_CSRMatrix *C; - - - /* trivial case */ - if (nnz_A == 0) - { - C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); - C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); - C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); - } - else - { + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int nrows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int ncols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Int nnz_A = hypre_CSRMatrixNumNonzeros(A); + HYPRE_Complex *C_data; + HYPRE_Int *C_i; + HYPRE_Int *C_j; + hypre_CSRMatrix *C; + + + /* trivial case */ + if (nnz_A == 0) + { + C_i = hypre_CTAlloc(HYPRE_Int, ncols_A + 1, HYPRE_MEMORY_DEVICE); + C_j = hypre_CTAlloc(HYPRE_Int, 0, HYPRE_MEMORY_DEVICE); + C_data = hypre_CTAlloc(HYPRE_Complex, 0, HYPRE_MEMORY_DEVICE); + } + else + { #if defined(HYPRE_USING_CUSPARSE) - hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransCusparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ROCSPARSE) - hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransRocsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #elif defined(HYPRE_USING_ONEMKLSPARSE) - hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, - data); + hypreDevice_CSRSpTransOnemklsparse(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, + data); #else - hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); + hypreDevice_CSRSpTrans(nrows_A, ncols_A, nnz_A, A_i, A_j, A_data, &C_i, &C_j, &C_data, data); #endif - } + } - C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); - hypre_CSRMatrixI(C) = C_i; - hypre_CSRMatrixJ(C) = C_j; - hypre_CSRMatrixData(C) = C_data; - hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; + C = hypre_CSRMatrixCreate(ncols_A, nrows_A, nnz_A); + hypre_CSRMatrixI(C) = C_i; + hypre_CSRMatrixJ(C) = C_j; + hypre_CSRMatrixData(C) = C_data; + hypre_CSRMatrixMemoryLocation(C) = HYPRE_MEMORY_DEVICE; - *AT_ptr = C; + *AT_ptr = C; - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); - return hypre_error_flag; + return hypre_error_flag; } #endif /* #if defined(HYPRE_USING_GPU) */ diff --git a/src/seq_mv/csr_matrix.c b/src/seq_mv/csr_matrix.c index 275625ec9f..98a26a942e 100644 --- a/src/seq_mv/csr_matrix.c +++ b/src/seq_mv/csr_matrix.c @@ -44,7 +44,7 @@ hypre_CSRMatrixCreate( HYPRE_Int num_rows, /* set defaults */ hypre_CSRMatrixOwnsData(matrix) = 1; -#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) +#if defined(HYPRE_USING_CUSPARSE) || defined(HYPRE_USING_ROCSPARSE) || defined(HYPRE_USING_ONEMKLSPARSE) 
hypre_CSRMatrixSortedJ(matrix) = NULL; hypre_CSRMatrixSortedData(matrix) = NULL; hypre_CSRMatrixCsrsvData(matrix) = NULL; diff --git a/src/seq_mv/csr_matvec_device.c b/src/seq_mv/csr_matvec_device.c index 811040a510..8b61018ccd 100644 --- a/src/seq_mv/csr_matvec_device.c +++ b/src/seq_mv/csr_matvec_device.c @@ -117,7 +117,7 @@ hypre_CSRMatrixMatvecDevice( HYPRE_Int trans, hypre_CSRMatrixMatvecDevice2(trans, alpha, A, x, beta, y, offset); } - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) hypre_GpuProfilingPopRange(); @@ -201,7 +201,7 @@ hypre_CSRMatrixMatvecCusparseNewAPI( HYPRE_Int trans, #endif dBuffer) ); - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); if (trans) { diff --git a/src/seq_mv/csr_spgemm_device.c b/src/seq_mv/csr_spgemm_device.c index 7d44c2cd05..b4074dadb9 100644 --- a/src/seq_mv/csr_spgemm_device.c +++ b/src/seq_mv/csr_spgemm_device.c @@ -89,7 +89,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnz(m, k, n, d_ia, d_ja, d_ib, d_jb, 0 /* without input rc */, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("Rownnz time %f\n", t2); #endif @@ -101,7 +101,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, 1 /* exact row nnz */, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -115,7 +115,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -126,7 +126,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypreDevice_CSRSpGemmNumerWithRownnzEstimate(m, k, n, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_rc, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif @@ -140,7 +140,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, #endif hypreDevice_CSRSpGemmRownnzEstimate(m, k, n, d_ia, d_ja, d_ib, d_jb, d_rc); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzEst time %f\n", t2); #endif @@ -157,7 +157,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, d_rc + 2 * m, thrust::identity() ); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("RownnzBound time %f\n", t2); #endif @@ -169,7 +169,7 @@ hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, rownnz_exact, &d_ic, &d_jc, &d_c, &nnzC); #ifdef HYPRE_SPGEMM_TIMING - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); t2 = hypre_MPI_Wtime() - t1; hypre_printf("SpGemmNumerical time %f\n", t2); #endif diff --git a/src/seq_mv/csr_spgemm_device_attempt.c b/src/seq_mv/csr_spgemm_device_attempt.c index 4e61662bdf..7a3fb9e4c6 100644 --- 
a/src/seq_mv/csr_spgemm_device_attempt.c +++ b/src/seq_mv/csr_spgemm_device_attempt.c @@ -506,7 +506,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ m, NULL, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash1_i, d_ghash1_j, d_ghash1_a, d_rc, d_rf ); @@ -542,7 +542,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (num_failed_rows + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_attempt), + HYPRE_GPU_LAUNCH ( (hypre_spgemm_attempt), gDim, bDim, /* shmem_size, */ num_failed_rows, rf_ind, d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_js, d_as, d_ghash2_i, d_ghash2_j, d_ghash2_a, @@ -563,7 +563,7 @@ hypre_spgemm_numerical_with_rowest( HYPRE_Int m, // for cases where one WARP works on a row dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_hash_into_C), gDim, bDim, m, d_rf, d_js, d_as, diff --git a/src/seq_mv/csr_spgemm_device_confident.c b/src/seq_mv/csr_spgemm_device_confident.c index 452acd52fb..871f27f67d 100644 --- a/src/seq_mv/csr_spgemm_device_confident.c +++ b/src/seq_mv/csr_spgemm_device_confident.c @@ -467,7 +467,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, hypre_create_ija(m, d_rc, d_ic, &d_jc, &d_c, &nnzC_nume); - HYPRE_CUDA_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz, + HYPRE_GPU_LAUNCH ( (hypre_spgemm_numeric < num_warps_per_block, shmem_hash_size, !exact_rownnz, hash_type > ), gDim, bDim, /* shmem_size, */ m, /* k, n, */ d_ia, d_ja, d_a, d_ib, d_jb, d_b, d_ic, d_jc, d_c, d_rc, @@ -493,7 +493,7 @@ hypre_spgemm_numerical_with_rownnz( HYPRE_Int m, /* copy to the final C */ dim3 gDim( (m + bDim.z - 1) / bDim.z ); - HYPRE_CUDA_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_spgemm_copy_from_Cext_into_C), gDim, bDim, m, d_ic, d_jc, d_c, d_ic_new, d_jc_new, d_c_new ); hypre_TFree(d_ic, HYPRE_MEMORY_DEVICE); diff --git a/src/seq_mv/csr_spgemm_device_rowbound.c b/src/seq_mv/csr_spgemm_device_rowbound.c index 094b5a82e2..9697eb83f4 100644 --- a/src/seq_mv/csr_spgemm_device_rowbound.c +++ b/src/seq_mv/csr_spgemm_device_rowbound.c @@ -313,19 +313,19 @@ hypre_spgemm_rownnz_attempt(HYPRE_Int m, * ---------------------------------------------------------------------------*/ if (hash_type == 'L') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'Q') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } else if (hash_type == 'D') { - HYPRE_CUDA_LAUNCH( (hypre_spgemm_symbolic), + HYPRE_GPU_LAUNCH( (hypre_spgemm_symbolic), gDim, bDim, m, rf_ind, /*k, n,*/ d_ia, d_ja, d_ib, d_jb, d_ghash_i, d_ghash_j, d_rc, d_rf ); } diff --git a/src/seq_mv/csr_spgemm_device_rowest.c b/src/seq_mv/csr_spgemm_device_rowest.c index f8f65c216f..50f76b081e 100644 --- a/src/seq_mv/csr_spgemm_device_rowest.c +++ b/src/seq_mv/csr_spgemm_device_rowest.c @@ -287,11 +287,11 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i dim3 gDim( (nsamples * N + 
bDim.z * HYPRE_WARP_SIZE - 1) / (bDim.z * HYPRE_WARP_SIZE) ); - HYPRE_CUDA_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); + HYPRE_GPU_LAUNCH( expdistfromuniform, gDim, bDim, nsamples * N, d_V1 ); /* step-1: layer 3-2 */ gDim.x = (K + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, K, d_ib, d_jb, d_V1, d_V2, NULL, nsamples, NULL, NULL, -1.0); @@ -301,7 +301,7 @@ void csr_spmm_rownnz_cohen(HYPRE_Int M, HYPRE_Int K, HYPRE_Int N, HYPRE_Int *d_i d_V3 = (T*) d_rc; gDim.x = (M + bDim.z - 1) / bDim.z; - HYPRE_CUDA_LAUNCH( (cohen_rowest_kernel), gDim, + HYPRE_GPU_LAUNCH( (cohen_rowest_kernel), gDim, bDim, M, d_ia, d_ja, d_V2, d_V3, d_rc, nsamples, d_low, d_upp, mult_factor); @@ -336,13 +336,13 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, if (row_est_mtd == 1) { /* naive overestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'U', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, NULL, d_rc ); } else if (row_est_mtd == 2) { /* naive underestimate */ - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'L', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_rc, NULL ); } else if (row_est_mtd == 3) @@ -361,7 +361,7 @@ hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, HYPRE_Int *d_low = d_low_upp; HYPRE_Int *d_upp = d_low_upp + m; - HYPRE_CUDA_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, + HYPRE_GPU_LAUNCH( (csr_spmm_rownnz_naive<'B', num_warps_per_block>), gDim, bDim, m, /*k,*/ n, d_ia, d_ja, d_ib, d_jb, d_low, d_upp ); /* Cohen's algorithm, stochastic approach */ diff --git a/src/seq_mv/csr_spgemm_device_util.c b/src/seq_mv/csr_spgemm_device_util.c index fac7e8e5ef..8153d82819 100644 --- a/src/seq_mv/csr_spgemm_device_util.c +++ b/src/seq_mv/csr_spgemm_device_util.c @@ -103,14 +103,14 @@ hypre_SpGemmCreateGlobalHashTable( HYPRE_Int num_rows, /* number of { ghash_i = hypre_TAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_ghash, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize1, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } else if (type == 2) { ghash_i = hypre_CTAlloc(HYPRE_Int, num_ghash + 1, HYPRE_MEMORY_DEVICE); dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, + HYPRE_GPU_LAUNCH( hypre_SpGemmGhashSize2, gDim, bDim, num_rows, row_id, num_ghash, row_sizes, ghash_i, SHMEM_HASH_SIZE ); } diff --git a/src/seq_mv/csr_spmv_device.c b/src/seq_mv/csr_spmv_device.c index d5d62d932a..1ae93fc279 100644 --- a/src/seq_mv/csr_spmv_device.c +++ b/src/seq_mv/csr_spmv_device.c @@ -170,7 +170,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 32; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 32) @@ -178,7 +178,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 16; 
const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 16) @@ -186,7 +186,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 8; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else if (rownnz >= 8) @@ -194,7 +194,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } else @@ -202,7 +202,7 @@ hypreDevice_CSRMatrixMatvec( HYPRE_Int nrows, const HYPRE_Int group_size = 4; const HYPRE_Int num_groups_per_block = SPMV_BLOCKDIM / group_size; const dim3 gDim((nrows + num_groups_per_block - 1) / num_groups_per_block); - HYPRE_CUDA_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, + HYPRE_GPU_LAUNCH( (hypre_csr_v_k_shuffle), gDim, bDim, nrows, alpha, d_ia, d_ja, d_a, d_x, beta, d_y ); } diff --git a/src/seq_mv/csr_sptrans_device.c b/src/seq_mv/csr_sptrans_device.c index 548665ed2e..bd85778a03 100644 --- a/src/seq_mv/csr_sptrans_device.c +++ b/src/seq_mv/csr_sptrans_device.c @@ -137,7 +137,7 @@ hypreDevice_CSRSpTransRocsparse(HYPRE_Int m, HYPRE_Int n, HYPR *d_ac_out = csc_a; #ifdef HYPRE_PROFILE - hypre_SyncCudaDevice(hypre_handle()) + hypre_SyncDevice(hypre_handle()) hypre_profile_times[HYPRE_TIMER_ID_SPTRANS] += hypre_MPI_Wtime(); #endif diff --git a/src/seq_mv/protos.h b/src/seq_mv/protos.h index 9081b58c20..4d7b494ad9 100644 --- a/src/seq_mv/protos.h +++ b/src/seq_mv/protos.h @@ -281,6 +281,8 @@ HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, diff --git a/src/seq_mv/seq_mv.h b/src/seq_mv/seq_mv.h index de34685237..485d045d08 100644 --- a/src/seq_mv/seq_mv.h +++ b/src/seq_mv/seq_mv.h @@ -553,6 +553,8 @@ HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); +HYPRE_Int hypreDevice_CSRSpTransOnemklsparse(HYPRE_Int m, HYPRE_Int n, HYPRE_Int nnzA, HYPRE_Int *d_ia, HYPRE_Int *d_ja, HYPRE_Complex *d_aa, HYPRE_Int **d_ic_out, HYPRE_Int **d_jc_out, HYPRE_Complex **d_ac_out, HYPRE_Int want_data); + HYPRE_Int hypreDevice_CSRSpGemm(hypre_CSRMatrix *A, hypre_CSRMatrix *B, 
hypre_CSRMatrix **C_ptr); HYPRE_Int hypreDevice_CSRSpGemmRownnzEstimate(HYPRE_Int m, HYPRE_Int k, HYPRE_Int n, diff --git a/src/seq_mv/vector.c b/src/seq_mv/vector.c index 8b024f39c5..bfab868fbb 100644 --- a/src/seq_mv/vector.c +++ b/src/seq_mv/vector.c @@ -300,7 +300,7 @@ hypre_SeqVectorSetConstantValues( hypre_Vector *v, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -488,7 +488,7 @@ hypre_SeqVectorScale( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -542,7 +542,7 @@ hypre_SeqVectorAxpy( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -596,7 +596,7 @@ hypre_SeqVectorElmdivpy( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -647,7 +647,7 @@ hypre_SeqVectorElmdivpyMarked( hypre_Vector *x, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -704,7 +704,7 @@ hypre_SeqVectorInnerProd( hypre_Vector *x, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #endif #ifdef HYPRE_PROFILE @@ -806,7 +806,7 @@ hypre_SeqVectorMax( HYPRE_Complex alpha, #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */ - hypre_SyncCudaComputeStream(hypre_handle()); + hypre_SyncDeviceComputeStream(hypre_handle()); #ifdef HYPRE_PROFILE hypre_profile_times[HYPRE_TIMER_ID_BLAS1] += hypre_MPI_Wtime(); diff --git a/src/sstruct_mv/sstruct_matrix.c b/src/sstruct_mv/sstruct_matrix.c index 1d9ce85366..e51066abcc 100644 --- a/src/sstruct_mv/sstruct_matrix.c +++ b/src/sstruct_mv/sstruct_matrix.c @@ -392,7 +392,7 @@ hypre_SStructPMatrixSetBoxValues( hypre_SStructPMatrix *pmatrix, values, action, -1, 0); /* TODO: Why need DeviceSync? */ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/sstruct_mv/sstruct_vector.c b/src/sstruct_mv/sstruct_vector.c index fdeeae6421..fa8db02a35 100644 --- a/src/sstruct_mv/sstruct_vector.c +++ b/src/sstruct_mv/sstruct_vector.c @@ -247,7 +247,7 @@ hypre_SStructPVectorSetBoxValues( hypre_SStructPVector *pvector, /* TODO: Why need DeviceSync? 
*/ #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif /* set (AddTo/Get) or clear (Set) values outside the grid in ghost zones */ if (action != 0) diff --git a/src/struct_mv/_hypre_struct_mv.hpp b/src/struct_mv/_hypre_struct_mv.hpp index a3845755ef..c30e3398ae 100644 --- a/src/struct_mv/_hypre_struct_mv.hpp +++ b/src/struct_mv/_hypre_struct_mv.hpp @@ -800,7 +800,7 @@ extern "C++" const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -861,7 +861,7 @@ extern "C++" hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/struct_mv/boxloop_cuda.h b/src/struct_mv/boxloop_cuda.h index ef36562ef5..d453864f8b 100644 --- a/src/struct_mv/boxloop_cuda.h +++ b/src/struct_mv/boxloop_cuda.h @@ -74,7 +74,7 @@ extern "C++" const dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); const dim3 gDim = hypre_GetDefaultDeviceGridDimension(length, "thread", bDim); - HYPRE_CUDA_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); + HYPRE_GPU_LAUNCH( forall_kernel, gDim, bDim, loop_body, length ); } } @@ -135,7 +135,7 @@ extern "C++" hypre_printf("length= %d, blocksize = %d, gridsize = %d\n", length, bDim.x, gDim.x); */ - HYPRE_CUDA_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); + HYPRE_GPU_LAUNCH( reductionforall_kernel, gDim, bDim, length, reducer, loop_body ); } } diff --git a/src/test/ij.c b/src/test/ij.c index 26640554c7..a3dcfc76b3 100644 --- a/src/test/ij.c +++ b/src/test/ij.c @@ -3406,7 +3406,7 @@ main( hypre_int argc, } #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3766,7 +3766,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3804,7 +3804,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -3865,7 +3865,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; @@ -3897,7 +3897,7 @@ main( hypre_int argc, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif tt = hypre_MPI_Wtime() - tt; diff --git a/src/test/ij_assembly.c b/src/test/ij_assembly.c index bb17d32803..fb28c9ba55 100644 --- a/src/test/ij_assembly.c +++ b/src/test/ij_assembly.c @@ -678,7 +678,7 @@ test_Set(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -707,7 +707,7 @@ test_Set(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -831,7 +831,7 @@ 
test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif time_index = hypre_InitializeTiming("Test SetValues OffProc"); @@ -862,7 +862,7 @@ test_SetOffProc(HYPRE_ParCSRMatrix parcsr_A, //cudaProfilerStop(); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #endif hypre_EndTiming(time_index); @@ -945,7 +945,7 @@ test_SetSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -996,7 +996,7 @@ test_SetSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1072,7 +1072,7 @@ test_AddSet(MPI_Comm comm, #endif #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1120,7 +1120,7 @@ test_AddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif @@ -1178,7 +1178,7 @@ test_SetAddSet(MPI_Comm comm, chunk_size = nrows / nchunks; #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStart(); #endif @@ -1244,7 +1244,7 @@ test_SetAddSet(MPI_Comm comm, HYPRE_IJMatrixAssemble(ij_A); #if defined(HYPRE_USING_GPU) - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); #if defined(CUDA_PROFILER) cudaProfilerStop(); #endif diff --git a/src/test/ij_mm.c b/src/test/ij_mm.c index 4bbf24fc39..807e9b1630 100644 --- a/src/test/ij_mm.c +++ b/src/test/ij_mm.c @@ -161,7 +161,7 @@ void runjob1( HYPRE_ParCSRMatrix parcsr_A, if (i == rep - 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, A*A", hypre_MPI_COMM_WORLD); @@ -350,7 +350,7 @@ void runjob2( HYPRE_ParCSRMatrix parcsr_A, if (i == 1) { - hypre_SyncCudaDevice(hypre_handle()); + hypre_SyncDevice(hypre_handle()); //cudaProfilerStop(); hypre_EndTiming(time_index); hypre_PrintTiming("Device Parcsr Matrix-by-Matrix, RAP2", hypre_MPI_COMM_WORLD); @@ -452,7 +452,7 @@ main( hypre_int argc, HYPRE_Init(); /* for timing, sync after kernels */ - hypre_SetSyncCudaCompute(1); + hypre_SetSyncDeviceCompute(1); #if defined(HYPRE_USING_CUDA) hypre_HandleDefaultExecPolicy(hypre_handle()) = HYPRE_EXEC_DEVICE; diff --git a/src/test/zboxloop.c b/src/test/zboxloop.c index f836aba02e..592ab6a158 100644 --- a/src/test/zboxloop.c +++ b/src/test/zboxloop.c @@ -20,8 +20,6 @@ * Test driver to time new boxloops and compare to the old ones *--------------------------------------------------------------------------*/ -#define DEVICE_VAR - hypre_int main( hypre_int argc, char *argv[] ) @@ -39,6 +37,7 @@ main( hypre_int argc, //HYPRE_Int xi1, xi2, xi3, xi4; HYPRE_Int xi1; HYPRE_Real *xp1, *xp2, *xp3, *xp4; + HYPRE_Real *d_xp1, *d_xp2, *d_xp3, *d_xp4; hypre_Index loop_size, start, unit_stride, index; /*----------------------------------------------------------- @@ -51,6 +50,8 @@ main( hypre_int argc, 
hypre_MPI_Comm_size(hypre_MPI_COMM_WORLD, &num_procs ); hypre_MPI_Comm_rank(hypre_MPI_COMM_WORLD, &myid ); + HYPRE_Init(); + /*----------------------------------------------------------- * Set defaults *-----------------------------------------------------------*/ @@ -65,6 +66,8 @@ main( hypre_int argc, Q = 1; R = 1; + reps = -1; + /*----------------------------------------------------------- * Parse command line *-----------------------------------------------------------*/ @@ -92,6 +95,11 @@ main( hypre_int argc, arg_index++; dim = atoi(argv[arg_index++]); } + else if ( strcmp(argv[arg_index], "-reps") == 0 ) + { + arg_index++; + reps = atoi(argv[arg_index++]); + } else if ( strcmp(argv[arg_index], "-help") == 0 ) { print_usage = 1; @@ -230,7 +238,7 @@ main( hypre_int argc, hypre_MPI_Barrier(hypre_MPI_COMM_WORLD); /*----------------------------------------------------------- - * Time old boxloops + * Time old boxloops [Device] *-----------------------------------------------------------*/ /* Time BoxLoop0 */ @@ -239,12 +247,14 @@ main( hypre_int argc, for (rep = 0; rep < reps; rep++) { xi1 = 0; +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop0Begin(3, loop_size); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; //xi1++; } hypre_BoxLoop0End(); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -253,12 +263,14 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1) hypre_BoxLoop1Begin(3, loop_size, x1_data_box, start, unit_stride, xi1); { - xp1[xi1] += xp1[xi1]; + d_xp1[xi1] += d_xp1[xi1]; } hypre_BoxLoop1End(xi1); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -267,13 +279,15 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2) hypre_BoxLoop2Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2); { - xp1[xi1] += xp1[xi1] + xp2[xi2]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2]; } hypre_BoxLoop2End(xi1, xi2); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -282,14 +296,16 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3) hypre_BoxLoop3Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3]; } hypre_BoxLoop3End(xi1, xi2, xi3); +#undef DEVICE_VAR } hypre_EndTiming(time_index); @@ -298,24 +314,26 @@ main( hypre_int argc, hypre_BeginTiming(time_index); for (rep = 0; rep < reps; rep++) { +#define DEVICE_VAR is_device_ptr(d_xp1,d_xp2,d_xp3,d_xp4) hypre_BoxLoop4Begin(3, loop_size, x1_data_box, start, unit_stride, xi1, x2_data_box, start, unit_stride, xi2, x3_data_box, start, unit_stride, xi3, x4_data_box, start, unit_stride, xi4); { - xp1[xi1] += xp1[xi1] + xp2[xi2] + xp3[xi3] + xp4[xi4]; + d_xp1[xi1] += d_xp1[xi1] + d_xp2[xi2] + d_xp3[xi3] + d_xp4[xi4]; } hypre_BoxLoop4End(xi1, xi2, xi3, xi4); +#undef DEVICE_VAR } hypre_EndTiming(time_index); - hypre_PrintTiming("Old BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("Old BoxLoop times [DEVICE]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); /*----------------------------------------------------------- - * Time new boxloops + * Time new boxloops [Host] *-----------------------------------------------------------*/ /* 
Time BoxLoop0 */ @@ -415,7 +433,7 @@ main( hypre_int argc, } hypre_EndTiming(time_index); - hypre_PrintTiming("New BoxLoop times", hypre_MPI_COMM_WORLD); + hypre_PrintTiming("New BoxLoop times [HOST]", hypre_MPI_COMM_WORLD); hypre_FinalizeTiming(time_index); hypre_ClearTiming(); @@ -427,11 +445,19 @@ main( hypre_int argc, hypre_BoxDestroy(x2_data_box); hypre_BoxDestroy(x3_data_box); hypre_BoxDestroy(x4_data_box); + hypre_TFree(xp1, HYPRE_MEMORY_HOST); hypre_TFree(xp2, HYPRE_MEMORY_HOST); hypre_TFree(xp3, HYPRE_MEMORY_HOST); hypre_TFree(xp4, HYPRE_MEMORY_HOST); + hypre_TFree(d_xp1, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp2, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp3, HYPRE_MEMORY_DEVICE); + hypre_TFree(d_xp4, HYPRE_MEMORY_DEVICE); + + HYPRE_Finalize(); + /* Finalize MPI */ hypre_MPI_Finalize(); diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 5dc0ff6a10..6ac7ccd255 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -83,7 +83,15 @@ typedef double HYPRE_Real; #endif #if defined(HYPRE_COMPLEX) -typedef double _Complex HYPRE_Complex; + +#if defined(HYPRE_USING_SYCL) + typedef std::complex<double> HYPRE_Complex; +#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) + typedef thrust::complex<double> HYPRE_Complex; +#else + typedef double _Complex HYPRE_Complex; +#endif + #define HYPRE_MPI_COMPLEX MPI_C_DOUBLE_COMPLEX /* or MPI_LONG_DOUBLE ? */ #else /* default */ @@ -177,11 +185,15 @@ HYPRE_Int HYPRE_AssumedPartitionCheck(); * HYPRE memory location *--------------------------------------------------------------------------*/ +// ABB: HYPRE_MEMORY_UNIFIED allocates SHARED (unified) memory at selected +// call sites; building with the HYPRE_USING_UNIFIED_MEMORY macro enables it +// everywhere typedef enum _HYPRE_MemoryLocation { HYPRE_MEMORY_UNDEFINED = -1, HYPRE_MEMORY_HOST, - HYPRE_MEMORY_DEVICE + HYPRE_MEMORY_DEVICE, + HYPRE_MEMORY_UNIFIED } HYPRE_MemoryLocation; HYPRE_Int HYPRE_SetMemoryLocation(HYPRE_MemoryLocation memory_location); diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 0df44e6bea..d26bf1927b 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -635,6 +635,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location) #endif } + if (location == HYPRE_MEMORY_UNIFIED) + { + return hypre_MEMORY_UNIFIED; + } + return hypre_MEMORY_UNDEFINED; } @@ -1740,8 +1745,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR hypre_UnorderedBigIntMap *inverse_map); #if defined(HYPRE_USING_GPU) -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); -HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); +HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle); HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle); HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y); @@ -1772,10 +1777,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod); void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e); char *hypre_strcpy(char *destination, const char *source); -HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action); -HYPRE_Int hypre_RestoreSyncCudaCompute(); -HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr); -HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle); 
+HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action); +HYPRE_Int hypre_RestoreSyncDeviceCompute(); +HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr); +HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle); /* handle.c */ HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse ); diff --git a/src/utilities/_hypre_utilities.hpp b/src/utilities/_hypre_utilities.hpp index 9bdf2d06a7..1a6f392e84 100644 --- a/src/utilities/_hypre_utilities.hpp +++ b/src/utilities/_hypre_utilities.hpp @@ -110,6 +110,11 @@ struct hypre_device_allocator #elif defined(HYPRE_USING_SYCL) +typedef sycl::range<1> dim3; +#define __global__ +#define __host__ +#define __device__ + /* WM: problems with this being inside extern C++ {} */ /* #include */ @@ -392,17 +397,39 @@ struct hypre_GpuMatData #define hypre_GpuMatDataMatInfo(data) ((data) -> mat_info) #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer) +/* device_utils.c, some common functions for CUDA, SYCL, HIP */ + +dim3 hypre_GetDefaultDeviceBlockDimension(); + +dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, + dim3 bDim ); + +HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind); + +HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); + +HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); + +HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, + HYPRE_Int *d_row_ptr); + +HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); + +template +HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); + #endif //#if defined(HYPRE_USING_GPU) #if defined(HYPRE_USING_SYCL) /* device_utils.c */ HYPRE_Int HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -443,15 +470,15 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -467,7 +494,7 @@ using namespace thrust::placeholders; } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) 
HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ @@ -1002,10 +1029,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -1017,22 +1040,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1058,6 +1065,482 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) +//////////////////////////////////////////////////////////////////////////////////////////////// + +#if defined(HYPRE_USING_SYCL) + +#pragma once + +#include +#include +#include +#include + +#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if + +#include +#include +#include +#include + +#define __forceinline__ __inline__ __attribute__((always_inline)) + +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls + * NOTE: IN HYPRE'S DEFAULT STREAM + * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + */ + +template +OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last, + InputIter2 input_first, OutputIter result) { + static_assert( + std::is_same::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value && + std::is_same< + typename std::iterator_traits::iterator_category, + std::random_access_iterator_tag>::value, + "Iterators passed to algorithms must be random-access iterators."); + auto perm_begin = + oneapi::dpl::make_permutation_iterator(input_first, map_first); + const int n = ::std::distance(map_first, map_last); + + return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), + perm_begin, perm_begin + n, result); +} + +#if defined(HYPRE_DEBUG) +#if 
defined(HYPRE_USING_CUDA) +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#endif +#else // #if defined(HYPRE_DEBUG) +#define GPU_LAUNCH_SYNC +#endif // defined(HYPRE_DEBUG) + +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) \ +{ \ + if ( gridsize[0] == 0 || blocksize[0] == 0 ) \ + { \ + hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n", \ + __FILE__, __LINE__, \ + gridsize[0], blocksize[0]); \ + assert(0); exit(1); \ + } \ + else \ + { \ + hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \ + [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] { \ + (kernel_name)(item, __VA_ARGS__); \ + }); \ + } \ +} + +/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right + * The following one works OK for now */ + +#define HYPRE_ONEDPL_CALL(func_name, ...) \ + func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__); + +// /* return the number of threads in block */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_num_threads() +// { +// switch (dim) +// { +// case 1: +// return (blockDim.x); +// case 2: +// return (blockDim.x * blockDim.y); +// case 3: +// return (blockDim.x * blockDim.y * blockDim.z); +// } + +// return -1; +// } + +/* return the number of (sub_groups) warps in (work-group) block */ +template +static __forceinline__ +hypre_int hypre_gpu_get_num_warps(sycl::nd_item& item) +{ + return item.get_sub_group().get_group_range().get(0); +} + +/* return the thread lane id in warp */ +template +static __forceinline__ +hypre_int hypre_gpu_get_lane_id(sycl::nd_item& item) +{ + return item.get_sub_group().get_local_linear_id(); +} + +// /* return the number of threads in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_threads() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads(); +// } + +/* return the flattened work-item/thread id in global work space, + * Note: Since the use-cases always involved bdim = gdim = 1, the + * sycl:;nd_item<1> is only being used. 
SFINAE is used to prevent + * other dimensions (i.e., bdim != gdim != 1) */ +template < hypre_int bdim, hypre_int gdim > +static __forceinline__ +hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item) +{ + static_assert(bdim == 1 && gdim == 1); + return item.get_global_id(0); +} + +// /* return the number of warps in grid */ +// template +// static __forceinline__ +// hypre_int hypre_gpu_get_grid_num_warps() +// { +// return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps(); +// } + +/* return the flattened warp id in grid */ +template +static __forceinline__ +hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item) +{ + return item.get_group(0) * hypre_gpu_get_num_warps(item) + + item.get_sub_group().get_group_linear_id(); +} + +// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 +// static __forceinline__ +// hypre_double atomicAdd(hypre_double* address, hypre_double val) +// { +// hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address; +// hypre_ulonglongint old = *address_as_ull, assumed; + +// do { +// assumed = old; +// old = atomicCAS(address_as_ull, assumed, +// __double_as_longlong(val + +// __longlong_as_double(assumed))); + +// // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) +// } while (assumed != old); + +// return __longlong_as_double(old); +// } +// #endif + +template +static __forceinline__ +T read_only_load( const T *ptr ) +{ + return *ptr; +} + +// /* exclusive prefix scan */ +// template +// static __forceinline__ +// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum) +// { +// #pragma unroll +// for (hypre_int d = 2; d <=HYPRE_WARP_SIZE; d <<= 1) +// { +// T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1); +// if ( (lane_id & (d - 1)) == (d - 1) ) +// { +// in += t; +// } +// } + +// all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1); + +// if (lane_id == HYPRE_WARP_SIZE-1) +// { +// in = 0; +// } + +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); + +// if ( (lane_id & (d - 1)) == (d - 1)) +// { +// if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) ) +// { +// in += t; +// } +// else +// { +// in = t; +// } +// } +// } +// return in; +// } + +template +static __forceinline__ +T warp_reduce_sum(T in, sycl::nd_item& item) +{ + sycl::sub_group SG = item.get_sub_group(); + //sycl::ext::oneapi::reduce(SG, in, std::plus()); +#pragma unroll + for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1) + { + in += SG.shuffle_down(in, d); + } + return in; +} + +// template +// static __forceinline__ +// T warp_allreduce_sum(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_max(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_reduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, 
__shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// template +// static __forceinline__ +// T warp_allreduce_min(T in) +// { +// #pragma unroll +// for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1) +// { +// in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d)); +// } +// return in; +// } + +// static __forceinline__ +// hypre_int next_power_of_2(hypre_int n) +// { +// if (n <= 0) +// { +// return 0; +// } + +// /* if n is power of 2, return itself */ +// if ( (n & (n - 1)) == 0 ) +// { +// return n; +// } + +// n |= (n >> 1); +// n |= (n >> 2); +// n |= (n >> 4); +// n |= (n >> 8); +// n |= (n >> 16); +// n ^= (n >> 1); +// n = (n << 1); + +// return n; +// } + +// template +// struct absolute_value : public thrust::unary_function +// { +// T operator()(const T &x) const +// { +// return x < T(0) ? -x : x; +// } +// }; + +// template +// struct TupleComp2 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2)); +// } +// }; + +// template +// struct TupleComp3 +// { +// typedef thrust::tuple Tuple; + +// bool operator()(const Tuple& t1, const Tuple& t2) +// { +// if (thrust::get<0>(t1) < thrust::get<0>(t2)) +// { +// return true; +// } +// if (thrust::get<0>(t1) > thrust::get<0>(t2)) +// { +// return false; +// } +// if (thrust::get<0>(t2) == thrust::get<1>(t2)) +// { +// return false; +// } +// return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2); +// } +// }; + +// template +// struct is_negative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x < 0); +// } +// }; + +// template +// struct is_positive : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x > 0); +// } +// }; + +// template +// struct is_nonnegative : public thrust::unary_function +// { +// bool operator()(const T &x) +// { +// return (x >= 0); +// } +// }; + +template +struct in_range +{ + T low, up; + in_range(T low_, T up_) { low = low_; up = up_; } + + bool operator()(const T &x) const { return (x >= low && x <= up); } +}; + +// template +// struct out_of_range : public thrust::unary_function +// { +// T low, up; + +// out_of_range(T low_, T up_) { low = low_; up = up_; } + +// bool operator()(const T &x) +// { +// return (x < low || x > up); +// } +// }; + +#ifdef HYPRE_COMPLEX +template::value>::type> +struct less_than +{ + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); } +}; +#else +template::value>::type> +struct less_than +{ + T val; + less_than(T val_) { val = val_; } + bool operator()(const T &x) const { return (x < val); } +}; +#endif +// template +// struct modulo : public thrust::unary_function +// { +// T val; + +// modulo(T val_) { val = val_; } + +// T operator()(const T &x) +// { +// return (x % val); +// } +// }; + +// template +// struct equal : public thrust::unary_function +// { +// T val; + +// equal(T val_) { val = val_; } + +// bool operator()(const T &x) +// { +// return (x == val); +// } +// }; + +// struct print_functor +// { +// void operator()(HYPRE_Real val) +// { +// printf("%f\n", val); +// } +// }; + +#endif // #if defined(HYPRE_USING_SYCL) + 
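For readers coming from the CUDA side: with the typedef dim3 = sycl::range<1> and the SYCL HYPRE_GPU_LAUNCH defined above, a device kernel becomes an ordinary function whose first parameter is the sycl::nd_item<1> injected by the macro's parallel_for. A sketch of the calling convention (hypreSYCLKernel_axpy and hypre_axpy_device are made-up names; the helpers are the ones declared in this header):

   /* kernel: the first argument is supplied by HYPRE_GPU_LAUNCH's parallel_for */
   void hypreSYCLKernel_axpy(sycl::nd_item<1> item, HYPRE_Int n, HYPRE_Complex a,
                             HYPRE_Complex *x, HYPRE_Complex *y)
   {
      HYPRE_Int i = hypre_gpu_get_grid_thread_id<1, 1>(item);
      if (i < n)
      {
         y[i] += a * x[i];
      }
   }

   /* call site, mirroring the CUDA/HIP launch convention used elsewhere in this patch */
   void hypre_axpy_device(HYPRE_Int n, HYPRE_Complex a, HYPRE_Complex *x, HYPRE_Complex *y)
   {
      dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
      dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim);
      HYPRE_GPU_LAUNCH( hypreSYCLKernel_axpy, gDim, bDim, n, a, x, y );
   }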
+////////////////////////////////////////////////////////////////////////////////////////
+
 #if defined(HYPRE_USING_CUSPARSE)
 cudaDataType hypre_HYPREComplexToCudaDataType();
@@ -1336,7 +1819,7 @@ struct ReduceSum
       /* 2nd reduction with only *one* block */
       hypre_assert(nblocks >= 0 && nblocks <= 1024);
       const dim3 gDim(1), bDim(1024);
-      HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
+      HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
       hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
       val += init;
    }
diff --git a/src/utilities/complex.c b/src/utilities/complex.c
index eb8dca4f38..59b71bbf56 100644
--- a/src/utilities/complex.c
+++ b/src/utilities/complex.c
@@ -9,30 +9,52 @@
 #ifdef HYPRE_COMPLEX
-#include <complex.h>
-
 HYPRE_Complex
 hypre_conj( HYPRE_Complex value )
 {
-   return conj(value);
+#ifdef HYPRE_USING_SYCL
+   return std::conj(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::conj(value);
+#else
+   return conj(value);
+#endif
 }
 
 HYPRE_Real
 hypre_cabs( HYPRE_Complex value )
 {
-   return cabs(value);
+#ifdef HYPRE_USING_SYCL
+   return std::abs(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::abs(value);
+#else
+   return cabs(value);
+#endif
 }
 
 HYPRE_Real
 hypre_creal( HYPRE_Complex value )
 {
-   return creal(value);
+#ifdef HYPRE_USING_SYCL
+   return std::real(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::real(value);
+#else
+   return creal(value);
+#endif
 }
 
 HYPRE_Real
 hypre_cimag( HYPRE_Complex value )
 {
-   return cimag(value);
+#ifdef HYPRE_USING_SYCL
+   return std::imag(value);
+#elif defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   return thrust::imag(value);
+#else
+   return cimag(value);
+#endif
 }
 
-#endif
+#endif // HYPRE_COMPLEX
diff --git a/src/utilities/device_reducer.h b/src/utilities/device_reducer.h
index ed8604e92b..e62d90d213 100644
--- a/src/utilities/device_reducer.h
+++ b/src/utilities/device_reducer.h
@@ -267,7 +267,7 @@ struct ReduceSum
       /* 2nd reduction with only *one* block */
       hypre_assert(nblocks >= 0 && nblocks <= 1024);
       const dim3 gDim(1), bDim(1024);
-      HYPRE_CUDA_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
+      HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
       hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
       val += init;
    }
diff --git a/src/utilities/device_utils.c b/src/utilities/device_utils.c
index f00a23415f..a75f9be6d2 100644
--- a/src/utilities/device_utils.c
+++ b/src/utilities/device_utils.c
@@ -8,16 +8,185 @@
 #include "_hypre_utilities.h"
 #include "_hypre_utilities.hpp"
 
+// some common kernels for CUDA, HIP and SYCL
+#ifdef HYPRE_USING_GPU
+
+/**
+ * Get NNZ of each row in d_row_indices and store the results in d_rownnz
+ * All pointers are device pointers.
+ * d_rownnz can be the same as d_row_indices
+ */
+__global__ void
+hypreGPUKernel_GetRowNnz(
+   #ifdef HYPRE_USING_SYCL
+   sycl::nd_item<1> item,
+   #endif
+   HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia,
+   HYPRE_Int *d_offd_ia,
+   HYPRE_Int *d_rownnz)
+{
+#if defined(HYPRE_USING_SYCL)
+   const HYPRE_Int global_thread_id = hypre_gpu_get_grid_thread_id<1,1>(item);
+#else
+   const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>();
+#endif
+
+   if (global_thread_id < nrows)
+   {
+      HYPRE_Int i;
+
+      if (d_row_indices)
+      {
+         i = read_only_load(&d_row_indices[global_thread_id]);
+      }
+      else
+      {
+         i = global_thread_id;
+      }
+
+      d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) +
+                                   read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]);
+   }
+}
+
+HYPRE_Int*
+hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind)
+{
+   HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE);
+
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(0),
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#else
+   HYPRE_THRUST_CALL( lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      thrust::counting_iterator<HYPRE_Int>(0),
+                      thrust::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#endif
+
+   return d_row_ptr;
+}
+
+HYPRE_Int
+hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
+                                   HYPRE_Int *d_row_ptr)
+{
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL( oneapi::dpl::lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(0),
+                      oneapi::dpl::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#else
+   HYPRE_THRUST_CALL( lower_bound,
+                      d_row_ind, d_row_ind + nnz,
+                      thrust::counting_iterator<HYPRE_Int>(0),
+                      thrust::counting_iterator<HYPRE_Int>(nrows + 1),
+                      d_row_ptr);
+#endif
+
+   return hypre_error_flag;
+}
+
+HYPRE_Int*
+hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr)
+{
+   /* trivial case */
+   if (nrows <= 0 || nnz <= 0)
+   {
+      return NULL;
+   }
+
+   HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE);
+
+   hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind);
+
+   return d_row_ind;
+}
+
+HYPRE_Int
+hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i)
+{
+#ifdef HYPRE_USING_SYCL
+   return HYPRE_ONEDPL_CALL(oneapi::dpl::reduce, d_i, d_i + n);
+#else
+   return HYPRE_THRUST_CALL(reduce, d_i, d_i + n);
+#endif
+}
+
+HYPRE_Int
+hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
+{
 #if defined(HYPRE_USING_SYCL)
-sycl::range<1> hypre_GetDefaultDeviceBlockDimension()
+   HYPRE_ONEDPL_CALL(oneapi::dpl::inclusive_scan, d_i, d_i + n, d_i);
+#else
+   HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i);
+#endif
+   return hypre_error_flag;
+}
+
+HYPRE_Int
+hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i)
+{
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_ONEDPL_CALL(std::exclusive_scan, d_i, d_i + n, d_i, 0, std::plus<>());
+#else
+   HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i);
+#endif
+   return hypre_error_flag;
+}
+
+/* Input: d_row_num, of size nrows, contains the row indices that can be BigInt or Int
+ * Output: d_row_ind */
+template <typename T>
+HYPRE_Int
+hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
+                                          T *d_row_num, T *d_row_ind)
+{
+   /* trivial case */
+   if (nrows <= 0)
+   {
+      return hypre_error_flag;
+   }
+
+   HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz,
HYPRE_MEMORY_DEVICE); + + hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); + +#ifdef HYPRE_USING_SYCL + hypreSycl_gather(map, map + nnz, d_row_num, d_row_ind); +#else + HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); +#endif + + hypre_TFree(map, HYPRE_MEMORY_DEVICE); + + return hypre_error_flag; +} + +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_num, HYPRE_Int *d_row_ind); +#if defined(HYPRE_MIXEDINT) +template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, + HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); +#endif + +#endif // HYPRE_USING_GPU + +#if defined(HYPRE_USING_SYCL) +dim3 hypre_GetDefaultDeviceBlockDimension() { sycl::range<1> wgDim(hypre_HandleDeviceMaxWorkGroupSize(hypre_handle())); return wgDim; } -sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, - const char *granularity, - sycl::range<1> wgDim) +dim3 hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, + const char *granularity, + sycl::range<1> wgDim) { HYPRE_Int num_WGs = 0; HYPRE_Int num_workitems_per_WG = wgDim[0]; @@ -42,7 +211,45 @@ sycl::range<1> hypre_GetDefaultDeviceGridDimension(HYPRE_Int n, return gDim; } -#endif + +struct hypre_empty_row_functor +{ + bool operator()(const std::tuple& t) const + { + const HYPRE_Int a = std::get<0>(t); + const HYPRE_Int b = std::get<1>(t); + return a != b; + } +}; + +HYPRE_Int +hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, + HYPRE_Int *d_row_ind) +{ + /* trivial case */ + if (nrows <= 0 || nnz <= 0) + { + return hypre_error_flag; + } + + HYPRE_ONEDPL_CALL( std::fill, d_row_ind, d_row_ind + nnz, 0 ); + + HYPRE_ONEDPL_CALL( dpct::scatter_if, + oneapi::dpl::counting_iterator(0), + oneapi::dpl::counting_iterator(nrows), + d_row_ptr, + oneapi::dpl::make_transform_iterator( oneapi::dpl::make_zip_iterator(d_row_ptr, d_row_ptr + 1), + hypre_empty_row_functor() ), + d_row_ind, + oneapi::dpl::identity() ); + + HYPRE_ONEDPL_CALL( oneapi::dpl::inclusive_scan, d_row_ind, d_row_ind + nnz, d_row_ind, + sycl::maximum() ); + + return hypre_error_flag; +} + +#endif // HYPRE_USING_SYCL #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -84,7 +291,7 @@ void hypre_CudaCompileFlagCheck() HYPRE_CUDA_CALL( cudaMalloc(&cuda_arch_compile_d, sizeof(hypre_int)) ); hypre_TMemcpy(cuda_arch_compile_d, &cuda_arch_compile, hypre_int, 1, HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CompileFlagSafetyCheck, gDim, bDim, cuda_arch_compile_d ); hypre_TMemcpy(&cuda_arch_compile, cuda_arch_compile_d, hypre_int, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE); //hypre_TFree(cuda_arch_compile_d, HYPRE_MEMORY_DEVICE); @@ -150,36 +357,6 @@ hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, return gDim; } -/** - * Get NNZ of each row in d_row_indices and stored the results in d_rownnz - * All pointers are device pointers. 
- * d_rownnz can be the same as d_row_indices - */ -__global__ void -hypreCUDAKernel_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, - HYPRE_Int *d_offd_ia, - HYPRE_Int *d_rownnz) -{ - const HYPRE_Int global_thread_id = hypre_cuda_get_grid_thread_id<1, 1>(); - - if (global_thread_id < nrows) - { - HYPRE_Int i; - - if (d_row_indices) - { - i = read_only_load(&d_row_indices[global_thread_id]); - } - else - { - i = global_thread_id; - } - - d_rownnz[global_thread_id] = read_only_load(&d_diag_ia[i + 1]) - read_only_load(&d_diag_ia[i]) + - read_only_load(&d_offd_ia[i + 1]) - read_only_load(&d_offd_ia[i]); - } -} - /* special case: if d_row_indices == NULL, it means d_row_indices=[0,1,...,nrows-1] */ HYPRE_Int hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_ia, @@ -195,7 +372,7 @@ hypreDevice_GetRowNnz(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_di return hypre_error_flag; } - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, + HYPRE_GPU_LAUNCH( hypreGPUKernel_GetRowNnz, gDim, bDim, nrows, d_row_indices, d_diag_ia, d_offd_ia, d_rownnz ); return hypre_error_flag; @@ -335,7 +512,7 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, } */ - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_CopyParCSRRows, gDim, bDim, nrows, d_row_indices, has_offd, first_col, d_col_map_offd_A, d_diag_i, d_diag_j, d_diag_a, d_offd_i, d_offd_j, d_offd_a, @@ -344,28 +521,6 @@ hypreDevice_CopyParCSRRows(HYPRE_Int nrows, return hypre_error_flag; } -HYPRE_Int -hypreDevice_IntegerReduceSum(HYPRE_Int n, HYPRE_Int *d_i) -{ - return HYPRE_THRUST_CALL(reduce, d_i, d_i + n); -} - -HYPRE_Int -hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(inclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - -HYPRE_Int -hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i) -{ - HYPRE_THRUST_CALL(exclusive_scan, d_i, d_i + n, d_i); - - return hypre_error_flag; -} - HYPRE_Int hypreDevice_Scalen(HYPRE_Complex *d_x, size_t n, HYPRE_Complex v) { @@ -405,22 +560,6 @@ struct hypre_empty_row_functor } }; -HYPRE_Int* -hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr) -{ - /* trivial case */ - if (nrows <= 0 || nnz <= 0) - { - return NULL; - } - - HYPRE_Int *d_row_ind = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, d_row_ind); - - return d_row_ind; -} - HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, HYPRE_Int *d_row_ind) @@ -448,64 +587,6 @@ hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_ return hypre_error_flag; } -/* Input: d_row_num, of size nrows, contains the rows indices that can be BigInt or Int - * Output: d_row_ind */ -template -HYPRE_Int -hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - T *d_row_num, T *d_row_ind) -{ - /* trivial case */ - if (nrows <= 0) - { - return hypre_error_flag; - } - - HYPRE_Int *map = hypre_TAlloc(HYPRE_Int, nnz, HYPRE_MEMORY_DEVICE); - - hypreDevice_CsrRowPtrsToIndices_v2(nrows, nnz, d_row_ptr, map); - - HYPRE_THRUST_CALL(gather, map, map + nnz, d_row_num, d_row_ind); - - hypre_TFree(map, HYPRE_MEMORY_DEVICE); - - return hypre_error_flag; -} - -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_Int 
*d_row_num, HYPRE_Int *d_row_ind); -#if defined(HYPRE_MIXEDINT) -template HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, HYPRE_BigInt *d_row_num, HYPRE_BigInt *d_row_ind); -#endif - -HYPRE_Int* -hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind) -{ - HYPRE_Int *d_row_ptr = hypre_TAlloc(HYPRE_Int, nrows + 1, HYPRE_MEMORY_DEVICE); - - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return d_row_ptr; -} - -HYPRE_Int -hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr) -{ - HYPRE_THRUST_CALL( lower_bound, - d_row_ind, d_row_ind + nnz, - thrust::counting_iterator(0), - thrust::counting_iterator(nrows + 1), - d_row_ptr); - - return hypre_error_flag; -} - __global__ void hypreCUDAKernel_ScatterAddTrivial(HYPRE_Int n, HYPRE_Real *x, HYPRE_Int *map, HYPRE_Real *y) { @@ -546,7 +627,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea /* trivial cases, n = 1, 2 */ dim3 bDim = 1; dim3 gDim = 1; - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAddTrivial, gDim, bDim, ny, x, map, y ); } else { @@ -585,7 +666,7 @@ hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Rea dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(reduced_n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterAdd, gDim, bDim, reduced_n, x, reduced_map, reduced_y ); if (!work) @@ -628,7 +709,7 @@ hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v) dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_ScatterConstant, gDim, bDim, x, n, map, v ); return hypre_error_flag; } @@ -662,7 +743,7 @@ hypreDevice_IVAXPY(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_Comple dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPY, gDim, bDim, n, a, x, y ); return hypre_error_flag; } @@ -696,7 +777,7 @@ hypreDevice_IVAXPYMarked(HYPRE_Int n, HYPRE_Complex *a, HYPRE_Complex *x, HYPRE_ dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_IVAXPYMarked, gDim, bDim, n, a, x, y, marker, marker_val ); return hypre_error_flag; } @@ -735,7 +816,7 @@ hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector, gDim, bDim, n, A_i, A_data, x, beta, y ); return hypre_error_flag; } @@ -770,7 +851,7 @@ hypreDevice_DiagScaleVector2(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data, dim3 bDim = 
hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(n, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_DiagScaleVector2, gDim, bDim, n, A_i, A_data, x, beta, y, z ); return hypre_error_flag; } @@ -794,7 +875,7 @@ hypreDevice_BigToSmallCopy(HYPRE_Int *tgt, const HYPRE_BigInt *src, HYPRE_Int si dim3 bDim = hypre_GetDefaultDeviceBlockDimension(); dim3 gDim = hypre_GetDefaultDeviceGridDimension(size, "thread", bDim); - HYPRE_CUDA_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); + HYPRE_GPU_LAUNCH( hypreCUDAKernel_BigToSmallCopy, gDim, bDim, tgt, src, size); return hypre_error_flag; } @@ -1373,7 +1454,7 @@ hypre_DeviceDataDestroy(hypre_DeviceData *data) } HYPRE_Int -hypre_SyncCudaDevice(hypre_Handle *hypre_handle) +hypre_SyncDevice(hypre_Handle *hypre_handle) { #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); @@ -1381,6 +1462,8 @@ hypre_SyncCudaDevice(hypre_Handle *hypre_handle) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipDeviceSynchronize() ); +#elif defined(HYPRE_USING_SYCL) + HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->wait_and_throw() ); #endif return hypre_error_flag; } @@ -1400,55 +1483,57 @@ hypre_ResetCudaDevice(hypre_Handle *hypre_handle) * action: 0: set sync stream to false * 1: set sync stream to true * 2: restore sync stream to default - * 3: return the current value of cuda_compute_stream_sync - * 4: sync stream based on cuda_compute_stream_sync + * 3: return the current value of device_compute_stream_sync + * 4: sync stream based on device_compute_stream_sync */ HYPRE_Int -hypre_SyncCudaComputeStream_core(HYPRE_Int action, - hypre_Handle *hypre_handle, - HYPRE_Int *cuda_compute_stream_sync_ptr) +hypre_SyncDeviceComputeStream_core(HYPRE_Int action, + hypre_Handle *hypre_handle, + HYPRE_Int *device_compute_stream_sync_ptr) { /* with UVM the default is to sync at kernel completions, since host is also able to * touch GPU memory */ #if defined(HYPRE_USING_UNIFIED_MEMORY) - static const HYPRE_Int cuda_compute_stream_sync_default = 1; + static const HYPRE_Int device_compute_stream_sync_default = 1; #else - static const HYPRE_Int cuda_compute_stream_sync_default = 0; + static const HYPRE_Int device_compute_stream_sync_default = 0; #endif /* this controls if synchronize the stream after computations */ - static HYPRE_Int cuda_compute_stream_sync = cuda_compute_stream_sync_default; + static HYPRE_Int device_compute_stream_sync = device_compute_stream_sync_default; switch (action) { case 0: - cuda_compute_stream_sync = 0; + device_compute_stream_sync = 0; break; case 1: - cuda_compute_stream_sync = 1; + device_compute_stream_sync = 1; break; case 2: - cuda_compute_stream_sync = cuda_compute_stream_sync_default; + device_compute_stream_sync = device_compute_stream_sync_default; break; case 3: - *cuda_compute_stream_sync_ptr = cuda_compute_stream_sync; + *device_compute_stream_sync_ptr = device_compute_stream_sync; break; case 4: #if defined(HYPRE_USING_DEVICE_OPENMP) HYPRE_CUDA_CALL( cudaDeviceSynchronize() ); #else - if (cuda_compute_stream_sync) + if (device_compute_stream_sync) { #if defined(HYPRE_USING_CUDA) HYPRE_CUDA_CALL( cudaStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); #elif defined(HYPRE_USING_HIP) HYPRE_HIP_CALL( hipStreamSynchronize(hypre_HandleComputeStream(hypre_handle)) ); +#elif defined(HYPRE_USING_SYCL) 
+         HYPRE_SYCL_CALL( hypre_HandleComputeStream(hypre_handle)->ext_oneapi_submit_barrier() );
 #endif
          }
 #endif
          break;
       default:
-         hypre_printf("hypre_SyncCudaComputeStream_core invalid action\n");
+         hypre_printf("hypre_SyncDeviceComputeStream_core invalid action\n");
         hypre_error_in_arg(1);
    }
 
@@ -1456,35 +1541,35 @@ hypre_SyncCudaComputeStream_core(HYPRE_Int action,
 }
 
 HYPRE_Int
-hypre_SetSyncCudaCompute(HYPRE_Int action)
+hypre_SetSyncDeviceCompute(HYPRE_Int action)
 {
    /* convert to 1/0 */
    action = action != 0;
-   hypre_SyncCudaComputeStream_core(action, NULL, NULL);
+   hypre_SyncDeviceComputeStream_core(action, NULL, NULL);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_RestoreSyncCudaCompute()
+hypre_RestoreSyncDeviceCompute()
 {
-   hypre_SyncCudaComputeStream_core(2, NULL, NULL);
+   hypre_SyncDeviceComputeStream_core(2, NULL, NULL);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr)
+hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr)
 {
-   hypre_SyncCudaComputeStream_core(3, NULL, cuda_compute_stream_sync_ptr);
+   hypre_SyncDeviceComputeStream_core(3, NULL, device_compute_stream_sync_ptr);
 
    return hypre_error_flag;
 }
 
 HYPRE_Int
-hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle)
+hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle)
 {
-   hypre_SyncCudaComputeStream_core(4, hypre_handle, NULL);
+   hypre_SyncDeviceComputeStream_core(4, hypre_handle, NULL);
 
    return hypre_error_flag;
 }
@@ -1550,6 +1635,8 @@ hypre_bind_device( HYPRE_Int myid,
    /* get number of devices on this node */
    hypre_GetDeviceCount(&nDevices);
+   /* TODO: ABB might need to look into this since nDevices is overwritten with 1 */
+   nDevices = 1;
 
    /* set device */
    device_id = myNodeid % nDevices;
@@ -1564,4 +1651,3 @@ hypre_bind_device( HYPRE_Int myid,
 
    return hypre_error_flag;
 }
-
diff --git a/src/utilities/device_utils.h b/src/utilities/device_utils.h
index c019413d85..45006f9097 100644
--- a/src/utilities/device_utils.h
+++ b/src/utilities/device_utils.h
@@ -53,6 +53,11 @@
 
 #elif defined(HYPRE_USING_SYCL)
 
+typedef sycl::range<1> dim3;
+#define __global__
+#define __host__
+#define __device__
+
 /* WM: problems with this being inside extern C++ {} */
 /* #include */
 
@@ -335,17 +340,39 @@ struct hypre_GpuMatData
 #define hypre_GpuMatDataMatInfo(data)    ((data) -> mat_info)
 #define hypre_GpuMatDataSpMVBuffer(data) ((data) -> spmv_buffer)
 
+/* device_utils.c, some common functions for CUDA, SYCL, HIP */
+
+dim3 hypre_GetDefaultDeviceBlockDimension();
+
+dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity,
+                                          dim3 bDim );
+
+HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr,
+                                             HYPRE_Int *d_row_ind);
+
+HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr);
+
+HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind);
+
+HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind,
+                                             HYPRE_Int *d_row_ptr);
+
+HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i);
+
+HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
+
+HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i);
+
+template <typename T>
+HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz,
+                                                    HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind);
+
 #endif //#if defined(HYPRE_USING_GPU)
 
 #if defined(HYPRE_USING_SYCL)
 
 /* device_utils.c */
 HYPRE_Int
HYPRE_SetSYCLDevice(sycl::device user_device); -sycl::range<1> hypre_GetDefaultDeviceBlockDimension(); - -sycl::range<1> hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, - sycl::range<1> bDim ); - #endif // #if defined(HYPRE_USING_SYCL) #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) @@ -386,15 +413,15 @@ using namespace thrust::placeholders; #if defined(HYPRE_DEBUG) #if defined(HYPRE_USING_CUDA) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); } #elif defined(HYPRE_USING_HIP) -#define GPU_LAUNCH_SYNC { hypre_SyncCudaComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } +#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_HIP_CALL( hipGetLastError() ); } #endif #else // #if defined(HYPRE_DEBUG) #define GPU_LAUNCH_SYNC #endif // defined(HYPRE_DEBUG) -#define HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ +#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...) \ { \ if ( gridsize.x == 0 || gridsize.y == 0 || gridsize.z == 0 || \ blocksize.x == 0 || blocksize.y == 0 || blocksize.z == 0 ) \ @@ -410,7 +437,7 @@ using namespace thrust::placeholders; } \ } -#define HYPRE_CUDA_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_CUDA_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) +#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__) /* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right * The following one works OK for now */ @@ -945,10 +972,6 @@ hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2, T3 *val template HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in, T3 *vals_in, T1 *keys1_out, T2 *keys2_out, T3 *vals_out); -template -HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz, - HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind); - template HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v); @@ -960,22 +983,6 @@ HYPRE_Int hypreDevice_CopyParCSRRows(HYPRE_Int nrows, HYPRE_Int *d_row_indices, HYPRE_Int *d_diag_j, HYPRE_Complex *d_diag_a, HYPRE_Int *d_offd_i, HYPRE_Int *d_offd_j, HYPRE_Complex *d_offd_a, HYPRE_Int *d_ib, HYPRE_BigInt *d_jb, HYPRE_Complex *d_ab); -HYPRE_Int hypreDevice_IntegerReduceSum(HYPRE_Int m, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerInclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int hypreDevice_IntegerExclusiveScan(HYPRE_Int n, HYPRE_Int *d_i); - -HYPRE_Int* hypreDevice_CsrRowPtrsToIndices(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr); - -HYPRE_Int hypreDevice_CsrRowPtrsToIndices_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ptr, - HYPRE_Int *d_row_ind); - -HYPRE_Int* hypreDevice_CsrRowIndicesToPtrs(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind); - -HYPRE_Int hypreDevice_CsrRowIndicesToPtrs_v2(HYPRE_Int nrows, HYPRE_Int nnz, HYPRE_Int *d_row_ind, - HYPRE_Int *d_row_ptr); - HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map, HYPRE_Real *y, char *work); @@ -1001,6 +1008,482 @@ void hypre_DeviceDataCubCachingAllocatorDestroy(hypre_DeviceData *data); #endif // #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) 
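A worked example of the row-pointer/row-index conversions declared in device_utils.h above, for a hypothetical 3-row CSR matrix:

   /* d_row_ptr = {0, 2, 2, 5}                              (row 1 is empty)
    * hypreDevice_CsrRowPtrsToIndices  =>  d_row_ind = {0, 0, 2, 2, 2}
    * hypreDevice_CsrRowIndicesToPtrs inverts this with a vectorized
    * lower_bound: row_ptr[i] is the first position in the sorted
    * d_row_ind whose value is >= i, which recovers {0, 2, 2, 5}. */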
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+#if defined(HYPRE_USING_SYCL)
+
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include // dpct::remove_if, remove_copy_if, copy_if, scatter_if
+
+#include
+#include
+#include
+#include
+
+#define __forceinline__ __inline__ __attribute__((always_inline))
+
+/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ * macro for launching SYCL kernels, SYCL, oneDPL, oneMKL calls
+ * NOTE: IN HYPRE'S DEFAULT STREAM
+ * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+ */
+
+template <typename InputIter1, typename InputIter2, typename OutputIter>
+OutputIter hypreSycl_gather(InputIter1 map_first, InputIter1 map_last,
+                            InputIter2 input_first, OutputIter result) {
+   static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+         std::is_same<
+            typename std::iterator_traits<InputIter2>::iterator_category,
+            std::random_access_iterator_tag>::value &&
+         std::is_same<
+            typename std::iterator_traits<OutputIter>::iterator_category,
+            std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+   auto perm_begin =
+      oneapi::dpl::make_permutation_iterator(input_first, map_first);
+   const int n = ::std::distance(map_first, map_last);
+
+   return oneapi::dpl::copy(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())),
+                            perm_begin, perm_begin + n, result);
+}
+
+#if defined(HYPRE_DEBUG)
+#if defined(HYPRE_USING_CUDA)
+#define GPU_LAUNCH_SYNC { hypre_SyncDeviceComputeStream(hypre_handle()); HYPRE_CUDA_CALL( cudaGetLastError() ); }
+#endif
+#else // #if defined(HYPRE_DEBUG)
+#define GPU_LAUNCH_SYNC
+#endif // defined(HYPRE_DEBUG)
+
+#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                                            \
+{                                                                                                          \
+   if ( gridsize[0] == 0 || blocksize[0] == 0 )                                                            \
+   {                                                                                                       \
+      hypre_printf("Error %s %d: Invalid SYCL 1D launch parameters grid/block (%d) (%d)\n",                \
+                   __FILE__, __LINE__,                                                                     \
+                   gridsize[0], blocksize[0]);                                                             \
+      assert(0); exit(1);                                                                                  \
+   }                                                                                                       \
+   else                                                                                                    \
+   {                                                                                                       \
+      hypre_HandleComputeStream(hypre_handle())->parallel_for(sycl::nd_range<1>(gridsize*blocksize, blocksize), \
+         [=] (sycl::nd_item<1> item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]] {                     \
+            (kernel_name)(item, __VA_ARGS__);                                                              \
+      });                                                                                                  \
+   }                                                                                                       \
+}
+
+/* RL: TODO Want macro HYPRE_ONEDPL_CALL to return value but I don't know how to do it right
+ * The following one works OK for now */
+
+#define HYPRE_ONEDPL_CALL(func_name, ...)                                                    \
+   func_name(oneapi::dpl::execution::make_device_policy(*hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
+
+// /* return the number of threads in block */
+// template <hypre_int dim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_num_threads()
+// {
+//    switch (dim)
+//    {
+//       case 1:
+//          return (blockDim.x);
+//       case 2:
+//          return (blockDim.x * blockDim.y);
+//       case 3:
+//          return (blockDim.x * blockDim.y * blockDim.z);
+//    }
+
+//    return -1;
+// }
+
+/* return the number of (sub_groups) warps in (work-group) block */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_num_warps(sycl::nd_item<dim>& item)
+{
+   return item.get_sub_group().get_group_range().get(0);
+}
+
+/* return the thread lane id in warp */
+template <hypre_int dim>
+static __forceinline__
+hypre_int hypre_gpu_get_lane_id(sycl::nd_item<dim>& item)
+{
+   return item.get_sub_group().get_local_linear_id();
+}
+
+// /* return the number of threads in grid */
+// template <hypre_int bdim, hypre_int gdim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_threads()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_threads();
+// }
+
+/* return the flattened work-item/thread id in global work space,
+ * Note: Since the use-cases always involved bdim = gdim = 1, only
+ * sycl::nd_item<1> is used. SFINAE is used to prevent
+ * other dimensions (i.e., bdim != gdim != 1) */
+template < hypre_int bdim, hypre_int gdim >
+static __forceinline__
+hypre_int hypre_gpu_get_grid_thread_id(sycl::nd_item<1>& item)
+{
+   static_assert(bdim == 1 && gdim == 1);
+   return item.get_global_id(0);
+}
+
+// /* return the number of warps in grid */
+// template <hypre_int bdim, hypre_int gdim>
+// static __forceinline__
+// hypre_int hypre_gpu_get_grid_num_warps()
+// {
+//    return hypre_gpu_get_num_blocks() * hypre_gpu_get_num_warps();
+// }
+
+/* return the flattened warp id in grid */
+template <hypre_int bdim, hypre_int gdim>
+static __forceinline__
+hypre_int hypre_gpu_get_grid_warp_id(sycl::nd_item<1>& item)
+{
+   return item.get_group(0) * hypre_gpu_get_num_warps(item) +
+          item.get_sub_group().get_group_linear_id();
+}
+
+// #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
+// static __forceinline__
+// hypre_double atomicAdd(hypre_double* address, hypre_double val)
+// {
+//    hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address;
+//    hypre_ulonglongint old = *address_as_ull, assumed;
+
+//    do {
+//       assumed = old;
+//       old = atomicCAS(address_as_ull, assumed,
+//                       __double_as_longlong(val +
+//                       __longlong_as_double(assumed)));
+
+//       // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
+//    } while (assumed != old);
+
+//    return __longlong_as_double(old);
+// }
+// #endif
+
+template <typename T>
+static __forceinline__
+T read_only_load( const T *ptr )
+{
+   return *ptr;
+}
+
+// /* exclusive prefix scan */
+// template <typename T>
+// static __forceinline__
+// T warp_prefix_sum(hypre_int lane_id, T in, T &all_sum)
+// {
+// #pragma unroll
+//    for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
+//    {
+//       T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
+//       if ( (lane_id & (d - 1)) == (d - 1) )
+//       {
+//          in += t;
+//       }
+//    }
+
+//    all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE-1);
+
+//    if (lane_id == HYPRE_WARP_SIZE-1)
+//    {
+//       in = 0;
+//    }
+
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
+
+//       if ( (lane_id & (d - 1)) == (d - 1))
+//       {
+//          if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
+//          {
+//             in += t;
+//          }
+//          else
+//          {
+//             in = t;
+//          }
+//       }
+//    }
+//    return in;
+// }
+
+template <typename T, hypre_int dim>
+static __forceinline__
+T warp_reduce_sum(T in, sycl::nd_item<dim>& item)
+{
+   sycl::sub_group SG = item.get_sub_group();
+   //sycl::ext::oneapi::reduce(SG, in, std::plus<T>());
+#pragma unroll
+   for (hypre_int d = SG.get_local_range().get(0)/2; d > 0; d >>= 1)
+   {
+      in += SG.shuffle_down(in, d);
+   }
+   return in;
+}
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_sum(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in += __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_reduce_max(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = max(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_max(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = max(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_reduce_min(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = min(in, __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// template <typename T>
+// static __forceinline__
+// T warp_allreduce_min(T in)
+// {
+// #pragma unroll
+//    for (hypre_int d = HYPRE_WARP_SIZE/2; d > 0; d >>= 1)
+//    {
+//       in = min(in, __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d));
+//    }
+//    return in;
+// }
+
+// static __forceinline__
+// hypre_int next_power_of_2(hypre_int n)
+// {
+//    if (n <= 0)
+//    {
+//       return 0;
+//    }
+
+//    /* if n is power of 2, return itself */
+//    if ( (n & (n - 1)) == 0 )
+//    {
+//       return n;
+//    }
+
+//    n |= (n >> 1);
+//    n |= (n >> 2);
+//    n |= (n >> 4);
+//    n |= (n >> 8);
+//    n |= (n >> 16);
+//    n ^= (n >> 1);
+//    n = (n << 1);
+
+//    return n;
+// }
+
+// template <typename T>
+// struct absolute_value : public thrust::unary_function<T, T>
+// {
+//    T operator()(const T &x) const
+//    {
+//       return x < T(0) ? -x : x;
+//    }
+// };
+
+// template <typename T1, typename T2>
+// struct TupleComp2
+// {
+//    typedef thrust::tuple<T1, T2> Tuple;
+
+//    bool operator()(const Tuple& t1, const Tuple& t2)
+//    {
+//       if (thrust::get<0>(t1) < thrust::get<0>(t2))
+//       {
+//          return true;
+//       }
+//       if (thrust::get<0>(t1) > thrust::get<0>(t2))
+//       {
+//          return false;
+//       }
+//       return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2));
+//    }
+// };
+
+// template <typename T1, typename T2>
+// struct TupleComp3
+// {
+//    typedef thrust::tuple<T1, T2> Tuple;
+
+//    bool operator()(const Tuple& t1, const Tuple& t2)
+//    {
+//       if (thrust::get<0>(t1) < thrust::get<0>(t2))
+//       {
+//          return true;
+//       }
+//       if (thrust::get<0>(t1) > thrust::get<0>(t2))
+//       {
+//          return false;
+//       }
+//       if (thrust::get<0>(t2) == thrust::get<1>(t2))
+//       {
+//          return false;
+//       }
+//       return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2);
+//    }
+// };
+
+// template <typename T>
+// struct is_negative : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x < 0);
+//    }
+// };
+
+// template <typename T>
+// struct is_positive : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x > 0);
+//    }
+// };
+
+// template <typename T>
+// struct is_nonnegative : public thrust::unary_function<T, bool>
+// {
+//    bool operator()(const T &x)
+//    {
+//       return (x >= 0);
+//    }
+// };
+
+template <typename T>
+struct in_range
+{
+   T low, up;
+   in_range(T low_, T up_) { low = low_; up = up_; }
+
+   bool operator()(const T &x) const { return (x >= low && x <= up); }
+};
+
+// template <typename T>
+// struct out_of_range : public thrust::unary_function<T, bool>
+// {
+//    T low, up;
+
+//    out_of_range(T low_, T up_) { low = low_; up = up_; }
+
+//    bool operator()(const T &x)
+//    {
+//       return (x < low || x > up);
+//    }
+// };
+
+#ifdef HYPRE_COMPLEX
+template::value>::type>
+struct less_than
+{
+   T val;
+   less_than(T val_) { val = val_; }
+   bool operator()(const T &x) const { return (hypre_abs(x) < hypre_abs(val)); }
+};
+#else
+template::value>::type>
+struct less_than
+{
+   T val;
+   less_than(T val_) { val = val_; }
+   bool operator()(const T &x) const { return (x < val); }
+};
+#endif
+// template <typename T>
+// struct modulo : public thrust::unary_function<T, T>
+// {
+//    T val;
+
+//    modulo(T val_) { val = val_; }
+
+//    T operator()(const T &x)
+//    {
+//       return (x % val);
+//    }
+// };
+
+// template <typename T>
+// struct equal : public thrust::unary_function<T, bool>
+// {
+//    T val;
+
+//    equal(T val_) { val = val_; }
+
+//    bool operator()(const T &x)
+//    {
+//       return (x == val);
+//    }
+// };
+
+// struct print_functor
+// {
+//    void operator()(HYPRE_Real val)
+//    {
+//       printf("%f\n", val);
+//    }
+// };
+
+#endif // #if defined(HYPRE_USING_SYCL)
+
+////////////////////////////////////////////////////////////////////////////////////////
+
 #if defined(HYPRE_USING_CUSPARSE)
 cudaDataType hypre_HYPREComplexToCudaDataType();
diff --git a/src/utilities/general.c b/src/utilities/general.c
index c3f7da063f..cd2c49c18b 100644
--- a/src/utilities/general.c
+++ b/src/utilities/general.c
@@ -72,7 +72,13 @@ hypre_HandleDestroy(hypre_Handle *hypre_handle_)
    hypre_HandleDeviceData(hypre_handle_) = NULL;
 #endif
 
+// In debug mode, hypre_TFree() checks the pointer location, which requires the
+// hypre_handle_'s compute queue if using sycl. But this was just destroyed above.
+#if defined(HYPRE_DEBUG) && defined(HYPRE_USING_SYCL)
+   free(hypre_handle_);
+#else
    hypre_TFree(hypre_handle_, HYPRE_MEMORY_HOST);
+#endif
 
    return hypre_error_flag;
 }
@@ -94,7 +100,38 @@ hypre_SetDevice(hypre_int device_id, hypre_Handle *hypre_handle_)
    HYPRE_HIP_CALL( hipSetDevice(device_id) );
 #endif
 
-#if defined(HYPRE_USING_GPU)
+#if defined(HYPRE_USING_SYCL)
+   HYPRE_Int nDevices=0;
+   hypre_GetDeviceCount(&nDevices);
+   if (device_id >= nDevices) {
+      hypre_printf("ERROR: SYCL device-ID exceeds the number of devices on-node... \n");
+   }
+
+   sycl::platform platform(sycl::gpu_selector{});
+   auto gpu_devices = platform.get_devices(sycl::info::device_type::gpu);
+   HYPRE_Int local_nDevices=0;
+   for (int i = 0; i < gpu_devices.size(); i++) {
+      // multi-tile GPUs
+      if (gpu_devices[i].get_info<sycl::info::device::partition_max_sub_devices>() > 0) {
+         auto subDevicesDomainNuma = gpu_devices[i].create_sub_devices<sycl::info::partition_property::partition_by_affinity_domain>(
+            sycl::info::partition_affinity_domain::numa);
+         for (auto &tile : subDevicesDomainNuma) {
+            if (local_nDevices == device_id) {
+               hypre_HandleDevice(hypre_handle_) = &tile;
+            }
+            local_nDevices++;
+         }
+      }
+      // single-tile GPUs
+      else {
+         if (local_nDevices == device_id) {
+            hypre_HandleDevice(hypre_handle_) = &(gpu_devices[i]);
+         }
+         local_nDevices++;
+      }
+   }
+#endif
+
+#if defined(HYPRE_USING_GPU) && !defined(HYPRE_USING_SYCL)
    if (hypre_handle_)
    {
 #if defined(HYPRE_USING_SYCL)
@@ -417,6 +454,10 @@ HYPRE_PrintDeviceInfo()
    hypre_printf("Max Compute Units: %d\n", max_compute_units);
 #endif
 
+#if defined(HYPRE_USING_SYCL)
+   // WM: TODO
+#endif
+
    return hypre_error_flag;
 }
diff --git a/src/utilities/int_array.c b/src/utilities/int_array.c
index 7a51fbb80d..65ea3f5ef9 100644
--- a/src/utilities/int_array.c
+++ b/src/utilities/int_array.c
@@ -168,7 +168,7 @@ hypre_IntArraySetConstantValues( hypre_IntArray *v,
 #endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
 
 #if defined(HYPRE_USING_GPU)
-   hypre_SyncCudaComputeStream(hypre_handle());
+   hypre_SyncDeviceComputeStream(hypre_handle());
 #endif
 
    return ierr;
diff --git a/src/utilities/memory.h b/src/utilities/memory.h
index bd815020c1..6fcaa29a01 100644
--- a/src/utilities/memory.h
+++ b/src/utilities/memory.h
@@ -122,6 +122,11 @@ hypre_GetActualMemLocation(HYPRE_MemoryLocation location)
 #endif
    }
 
+   if (location == HYPRE_MEMORY_UNIFIED)
+   {
+      return hypre_MEMORY_UNIFIED;
+   }
+
    return hypre_MEMORY_UNDEFINED;
 }
diff --git a/src/utilities/protos.h b/src/utilities/protos.h
index eb41f99847..ad3b5ff8a8 100644
--- a/src/utilities/protos.h
+++ b/src/utilities/protos.h
@@ -269,8 +269,8 @@ void hypre_big_sort_and_create_inverse_map(HYPRE_BigInt *in, HYPRE_Int len, HYPR
                                            hypre_UnorderedBigIntMap *inverse_map);
 
 #if defined(HYPRE_USING_GPU)
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
-HYPRE_Int hypre_SyncCudaDevice(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SyncDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypre_ResetCudaDevice(hypre_Handle *hypre_handle);
 HYPRE_Int hypreDevice_DiagScaleVector(HYPRE_Int n, HYPRE_Int *A_i, HYPRE_Complex *A_data,
                                       HYPRE_Complex *x, HYPRE_Complex beta, HYPRE_Complex *y);
@@ -301,10 +301,10 @@ HYPRE_Int hypre_multmod(HYPRE_Int a, HYPRE_Int b, HYPRE_Int mod);
 void hypre_partition1D(HYPRE_Int n, HYPRE_Int p, HYPRE_Int j, HYPRE_Int *s, HYPRE_Int *e);
 char *hypre_strcpy(char *destination, const char *source);
 
-HYPRE_Int hypre_SetSyncCudaCompute(HYPRE_Int action);
-HYPRE_Int hypre_RestoreSyncCudaCompute();
-HYPRE_Int hypre_GetSyncCudaCompute(HYPRE_Int *cuda_compute_stream_sync_ptr);
-HYPRE_Int hypre_SyncCudaComputeStream(hypre_Handle *hypre_handle);
+HYPRE_Int hypre_SetSyncDeviceCompute(HYPRE_Int action);
+HYPRE_Int hypre_RestoreSyncDeviceCompute();
+HYPRE_Int hypre_GetSyncDeviceCompute(HYPRE_Int *device_compute_stream_sync_ptr);
+HYPRE_Int hypre_SyncDeviceComputeStream(hypre_Handle *hypre_handle);
 
 /* handle.c */
 HYPRE_Int hypre_SetSpGemmUseCusparse( HYPRE_Int use_cusparse );
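A usage sketch of the renamed synchronization controls (function names as introduced by this patch; the batching scenario is illustrative, not taken from hypre):

   HYPRE_Int sync_flag;
   hypre_GetSyncDeviceCompute(&sync_flag);        /* action 3: read the current sync setting */
   hypre_SetSyncDeviceCompute(0);                 /* action 0: disable per-call stream syncs */
   /* ... launch a batch of device kernels ... */
   hypre_RestoreSyncDeviceCompute();              /* action 2: restore the build default */
   hypre_SyncDeviceComputeStream(hypre_handle()); /* action 4: sync the stream if the flag is set */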