Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Seq mv sycl #538

Open
wants to merge 49 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
c07bcfb
Add sycl configure option and memory functionality
Jul 24, 2021
61d0edb
Change names and fix initialization
Jul 27, 2021
bafa6c2
Fix cuda compilation
waynemitchell Jul 27, 2021
c16315d
Choose default exec policy for matvec
Jul 28, 2021
6d9fb5c
Merge branch 'master' into sycl
Jul 28, 2021
c58f944
Start boxloop implementation
Aug 3, 2021
25348d4
Remove nonfunctional code for fresh start
Sep 16, 2021
58b6e23
Add simple driver and remove problematic flag from configure
waynemitchell Sep 17, 2021
0c58ebe
Reproducing invalid kernel name error in simple
waynemitchell Sep 27, 2021
5695c97
boxloop1 running on frank
waynemitchell Sep 29, 2021
2ad440f
Merge branch 'master' into sycl
waynemitchell Sep 29, 2021
f4d9ba4
Resolve further merge conflicts, passes struct tests
waynemitchell Sep 29, 2021
845a433
Non-reduction boxloops done
waynemitchell Sep 30, 2021
c733ad6
Merge branch 'master' into sycl
waynemitchell Sep 30, 2021
4ed00c4
First attempt at reduction boxloops, seg faulting right now
waynemitchell Oct 1, 2021
2fb3f27
Reproducing seg fault when trying to launch trivial reduction paralle…
waynemitchell Oct 5, 2021
001fb9f
Reduction boxloops done
waynemitchell Oct 5, 2021
df301df
Cleanup
waynemitchell Oct 6, 2021
4e54d48
Added hypreLoopBegin/End
waynemitchell Oct 6, 2021
a127622
Bug fix
waynemitchell Oct 19, 2021
94a269d
Fix configuration options for non-unified memory
waynemitchell Oct 25, 2021
39fbd2d
Update oneapi reduction
waynemitchell Oct 26, 2021
193ee25
Bug fix in parallel
waynemitchell Oct 26, 2021
9166c16
Additional macro fixes and implementation of redblack relax
waynemitchell Oct 26, 2021
4fca1be
Automatic selection of block dimension
waynemitchell Oct 27, 2021
a6383e8
zboxloop
liruipeng Oct 27, 2021
4ddcc4a
Fixes for compiler update on jlse
waynemitchell Oct 29, 2021
345b0d0
Renamings
waynemitchell Oct 29, 2021
f48eec0
Try different formulation of reduction
waynemitchell Oct 29, 2021
980bee5
Autoconf clean up
waynemitchell Nov 2, 2021
2d5ee90
Cleanup boxloops, renamings, make sure tests compile
waynemitchell Nov 2, 2021
496afa7
Merge branch 'master' into sycl
waynemitchell Nov 2, 2021
4d303d3
Some placeholders and changes to allow ij interface to run on the host
waynemitchell Nov 3, 2021
99c5d9d
Add cmake compilation
waynemitchell Nov 4, 2021
ec8c5de
Some code cleanup
waynemitchell Nov 4, 2021
3254e31
[SYCL] convert sycl::device to sycl::device* for better handling (#504)
abagusetty Nov 5, 2021
68fc8be
[SYCL] add complex types for device
abagusetty Dec 7, 2021
b7ebf4e
[SYCL] kernel launch macro
abagusetty Dec 8, 2021
31d6238
Merge branch 'master' into seq_mv_sycl
abagusetty Dec 8, 2021
35fa901
[SYCL] changes to function, var names from _cuda_ to _device_ for uni…
abagusetty Dec 8, 2021
243e2b8
[SYCL] update, unify new functions for CUDA and SYCL in csr_matop_device
abagusetty Dec 9, 2021
9eb1f7f
[SYCL] enable oneDPL and some more updates
abagusetty Dec 10, 2021
f11b593
[SYCL] adding sycl::gather and few more common GPU functions
abagusetty Dec 14, 2021
ae30f74
[SYCL] fix the sycl scatter_if
abagusetty Dec 14, 2021
c73ef06
[SYCL] fix the build issues from std::exclusive_scan, lambda for scat…
abagusetty Dec 14, 2021
d3e3bf0
[SYCL] cleanup a for SYCL kernel query helper functions
abagusetty Dec 16, 2021
ad32b6f
[SYCL] simplify namespace for sycl::ext::oneapi::sub_group to sycl::s…
abagusetty Dec 21, 2021
9c6b6bc
[SYCL] unify code for CUDA, HIP and SYCL for easier maintanence
abagusetty Dec 21, 2021
bcf0e57
fix complex data types preprocessor for CUDA, HIP
abagusetty Dec 23, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/IJ_mv/IJMatrix_parcsr_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ hypre_IJMatrixSetAddValuesParCSRDevice( hypre_IJMatrix *matrix,
/* mark unwanted elements as -1 */
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(len1, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixValues_dev1, gDim, bDim, len1, indicator,
(HYPRE_Int *) row_indexes, ncols, indicator );

auto new_end = HYPRE_THRUST_CALL(
Expand Down Expand Up @@ -233,7 +233,7 @@ hypre_IJMatrixAssembleSortAndReduce1(HYPRE_Int N0, HYPRE_BigInt *I0, HYPRE_Big
/*
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(N0, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJMatrixAssembleSortAndReduce1, gDim, bDim, N0, I0, J0, X0, A0 );
*/

/* output X: 0: keep, 1: zero-out */
Expand Down
2 changes: 1 addition & 1 deletion src/IJ_mv/IJVector_parcsr_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ hypre_IJVectorAssembleParDevice(hypre_IJVector *vector)
/* set/add to local vector */
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(new_nnz, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_IJVectorAssemblePar, gDim, bDim, new_nnz, new_data, new_i,
vec_start, new_sora,
hypre_VectorData(hypre_ParVectorLocalVector(par_vector)) );

Expand Down
2 changes: 1 addition & 1 deletion src/config/configure.in
Original file line number Diff line number Diff line change
Expand Up @@ -2316,7 +2316,7 @@ AS_IF([test x"$hypre_using_sycl" == x"yes"],

if test "$hypre_user_chose_cuflags" = "no"
then
CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
if test "$hypre_using_debug" = "yes"
then
CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
Expand Down
2 changes: 1 addition & 1 deletion src/configure
Original file line number Diff line number Diff line change
Expand Up @@ -9143,7 +9143,7 @@ $as_echo "#define HYPRE_USING_SYCL 1" >>confdefs.h

if test "$hypre_user_chose_cuflags" = "no"
then
CUFLAGS="-fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
CUFLAGS="-D_GLIBCXX_USE_TBB_PAR_BACKEND=0 -fsycl -fsycl-unnamed-lambda -fsycl-device-code-split=per_kernel"
if test "$hypre_using_debug" = "yes"
then
CUFLAGS="-O0 -Wall -g ${CUFLAGS}"
Expand Down
2 changes: 1 addition & 1 deletion src/distributed_matrix/distributed_matrix_parcsr.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ hypre_DistributedMatrixGetRowParCSR( hypre_DistributedMatrix *matrix,

// RL: if HYPRE_ParCSRMatrixGetRow was on device, need the next line to guarantee it's done
#if defined(HYPRE_USING_GPU)
hypre_SyncCudaComputeStream(hypre_handle());
hypre_SyncDeviceComputeStream(hypre_handle());
#endif

return(ierr);
Expand Down
12 changes: 6 additions & 6 deletions src/parcsr_ls/ads.c
Original file line number Diff line number Diff line change
Expand Up @@ -627,12 +627,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
F2V_diag_nnz, 3, F2V_diag_J, Pi_diag_J );

gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
Pi_diag_data );
}
Expand Down Expand Up @@ -693,12 +693,12 @@ HYPRE_Int hypre_ADSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
F2V_offd_nnz, 3, F2V_offd_J, Pi_offd_J );

gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
Pi_offd_data );
}
Expand Down Expand Up @@ -907,7 +907,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
F2V_diag_nrows, 3, F2V_diag_I, NULL, RT100_data, RT010_data, RT001_data,
Pix_diag_data, Piy_diag_data, Piz_diag_data );
}
Expand Down Expand Up @@ -987,7 +987,7 @@ HYPRE_Int hypre_ADSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(F2V_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
F2V_offd_nrows, 3, F2V_offd_I, NULL, RT100_data, RT010_data, RT001_data,
Pix_offd_data, Piy_offd_data, Piz_offd_data );
}
Expand Down
2 changes: 1 addition & 1 deletion src/parcsr_ls/ame.c
Original file line number Diff line number Diff line change
Expand Up @@ -496,7 +496,7 @@ HYPRE_Int hypre_AMESetup(void *esolver)
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_GtEliminateBoundary, gDim, bDim,
nv, GtdI, GtdJ, GtdA, GtoI, GtoJ, GtoA, edge_bc, offd_edge_bc );
}
else
Expand Down
42 changes: 21 additions & 21 deletions src/parcsr_ls/ams.c
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ HYPRE_Int hypre_ParVectorBlockSplit(hypre_ParVector *x,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<0>, gDim, bDim,
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
}
else
Expand Down Expand Up @@ -241,7 +241,7 @@ HYPRE_Int hypre_ParVectorBlockGather(hypre_ParVector *x,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(size_ * dim, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParVectorBlockSplitGather<1>, gDim, bDim,
size_, dim, x_data_[0], x_data_[1], x_data_[2], x_data);
}
else
Expand Down Expand Up @@ -456,10 +456,10 @@ HYPRE_Int hypre_ParCSRMatrixFixZeroRowsDevice(hypre_ParCSRMatrix *A)
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
HYPRE_GPU_LAUNCH(hypreCUDAKernel_ParCSRMatrixFixZeroRows, gDim, bDim,
nrows, A_diag_i, A_diag_j, A_diag_data, A_offd_i, A_offd_data, num_cols_offd);

//hypre_SyncCudaComputeStream(hypre_handle());
//hypre_SyncDeviceComputeStream(hypre_handle());

return hypre_error_flag;
}
Expand Down Expand Up @@ -787,7 +787,7 @@ HYPRE_Int hypre_ParCSRMatrixSetDiagRows(hypre_ParCSRMatrix *A, HYPRE_Real d)
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(num_rows, "thread", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_ParCSRMatrixSetDiagRows, gDim, bDim,
num_rows, A_diag_I, A_diag_J, A_diag_data, A_offd_I, num_cols_offd, d);
}
else
Expand Down Expand Up @@ -1623,12 +1623,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_diag_nnz, dim, G_diag_J, Pi_diag_J );

gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
Pi_diag_data );
}
Expand Down Expand Up @@ -1696,12 +1696,12 @@ HYPRE_Int hypre_AMSComputePi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_offd_nnz, dim, G_offd_J, Pi_offd_J );

gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy2, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
Pi_offd_data );
}
Expand Down Expand Up @@ -1944,7 +1944,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
Pix_diag_data, Piy_diag_data, Piz_diag_data );
}
Expand Down Expand Up @@ -2010,7 +2010,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, NULL,
Pix_diag_data, Piy_diag_data, NULL );
}
Expand Down Expand Up @@ -2068,7 +2068,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, NULL, NULL,
Pix_diag_data, NULL, NULL );
}
Expand Down Expand Up @@ -2145,7 +2145,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
Pix_offd_data, Piy_offd_data, Piz_offd_data );
}
Expand Down Expand Up @@ -2227,7 +2227,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, NULL,
Pix_offd_data, Piy_offd_data, NULL );
}
Expand Down Expand Up @@ -2299,7 +2299,7 @@ HYPRE_Int hypre_AMSComputePixyz(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePixyz_copy, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, NULL, NULL,
Pix_offd_data, NULL, NULL );
}
Expand Down Expand Up @@ -2501,12 +2501,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_diag_nnz, dim, G_diag_J, GPi_diag_J );

gDim = hypre_GetDefaultDeviceGridDimension(G_diag_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
G_diag_nrows, dim, G_diag_I, G_diag_data, Gx_data, Gy_data, Gz_data,
GPi_diag_data );
}
Expand Down Expand Up @@ -2575,12 +2575,12 @@ HYPRE_Int hypre_AMSComputeGPi(hypre_ParCSRMatrix *A,
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nnz, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputePi_copy1, gDim, bDim,
G_offd_nnz, dim, G_offd_J, GPi_offd_J );

gDim = hypre_GetDefaultDeviceGridDimension(G_offd_nrows, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSComputeGPi_copy2, gDim, bDim,
G_offd_nrows, dim, G_offd_I, G_offd_data, Gx_data, Gy_data, Gz_data,
GPi_offd_data );
}
Expand Down Expand Up @@ -2815,7 +2815,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(nv, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_FixInterNodes, gDim, bDim,
nv, G0tdI, G0tdA, G0toI, G0toA, interior_nodes_data );
}
else
Expand Down Expand Up @@ -3401,7 +3401,7 @@ HYPRE_Int hypre_AMSSetup(void *solver,
{
dim3 bDim = hypre_GetDefaultDeviceBlockDimension();
dim3 gDim = hypre_GetDefaultDeviceGridDimension(Gt_num_rows, "warp", bDim);
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_AMSSetupScaleGGt, gDim, bDim,
Gt_num_rows, Gt_diag_I, Gt_diag_J, Gt_diag_data, Gt_offd_I, Gt_offd_data,
Gx_data, Gy_data, Gz_data );
}
Expand Down
10 changes: 5 additions & 5 deletions src/parcsr_ls/par_2s_interp_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
dim3 gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

/* only for rows corresponding to F2 (notice flag == -1) */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_local,
A_offd_nnz > 0,
Expand Down Expand Up @@ -160,7 +160,7 @@ hypre_BoomerAMGBuildModPartialExtInterpDevice( hypre_ParCSRMatrix *A,
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMInterpScaleAFF,
gDim, bDim,
AF2F_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
Expand Down Expand Up @@ -329,7 +329,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
dlam = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);
dtmp = hypre_TAlloc(HYPRE_Complex, AFC_nr_local, HYPRE_MEMORY_DEVICE);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_dlam_dtmp,
gDim, bDim,
AFC_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_FF)),
Expand Down Expand Up @@ -388,7 +388,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
gDim = hypre_GetDefaultDeviceGridDimension(A_nr_local, "warp", bDim);

/* only for rows corresponding to F2 (notice flag == -1) */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_compute_weak_rowsums,
gDim, bDim,
A_nr_local,
A_offd_nnz > 0,
Expand Down Expand Up @@ -438,7 +438,7 @@ hypre_BoomerAMGBuildModPartialExtPEInterpDevice( hypre_ParCSRMatrix *A,
* diagnoally scale As_F2F (from both sides) and replace the diagonal */
gDim = hypre_GetDefaultDeviceGridDimension(AF2F_nr_local, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_MMPEInterpScaleAFF,
gDim, bDim,
AF2F_nr_local,
hypre_CSRMatrixI(hypre_ParCSRMatrixDiag(As_F2F)),
Expand Down
4 changes: 2 additions & 2 deletions src/parcsr_ls/par_coarsen_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ hypre_PMISCoarseningInitDevice( hypre_ParCSRMatrix *S, /* in */
HYPRE_Int *new_end;

/* init CF_marker_diag and measure_diag: remove some special nodes */
HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningInit, gDim, bDim,
num_rows_diag, CF_init, S_diag_i, S_offd_i, measure_diag, CF_marker_diag );

/* communicate for measure_offd */
Expand Down Expand Up @@ -494,7 +494,7 @@ hypre_PMISCoarseningUpdateCFDevice( hypre_ParCSRMatrix *S, /* in
bDim = hypre_GetDefaultDeviceBlockDimension();
gDim = hypre_GetDefaultDeviceGridDimension(graph_diag_size, "warp", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
HYPRE_GPU_LAUNCH( hypreCUDAKernel_PMISCoarseningUpdateCF,
gDim, bDim,
graph_diag_size,
graph_diag,
Expand Down
2 changes: 1 addition & 1 deletion src/parcsr_ls/par_gauss_elim.c
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ HYPRE_Int hypre_dgemv_device(HYPRE_Int m, HYPRE_Int n, HYPRE_Int lda, HYPRE_Real
dim3 bDim(BLOCK_SIZE, 1, 1);
dim3 gDim = hypre_GetDefaultDeviceGridDimension(m, "thread", bDim);

HYPRE_CUDA_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );
HYPRE_GPU_LAUNCH( hypreCUDAKernel_dgemv, gDim, bDim, m, n, lda, a, x, y );

return hypre_error_flag;
}
Expand Down
Loading