From aadb6e35e8968a9ef1728b0f9dfed1901536e029 Mon Sep 17 00:00:00 2001 From: James Sandham <33790278+jsandham@users.noreply.github.com> Date: Tue, 29 Aug 2023 11:42:20 -0600 Subject: [PATCH] Cherrypick SWDEV-408046 (#594) (#349) * SWDEV-408046 (#594) solves SWDEV-408046 * Clang formatting --------- Co-authored-by: Yvan Mokwinski Co-authored-by: jsandham --- library/src/extra/bsrgemm_device.h | 19 +++--- library/src/extra/csrgemm_device.h | 20 +++--- .../src/extra/rocsparse_csrgemm_numeric.cpp | 18 +++--- .../src/extra/rocsparse_csrgemm_symbolic.cpp | 6 +- library/src/include/common.h | 63 ++++++++++++++++--- library/src/level2/bsrsv_device.h | 8 +-- library/src/level2/coomv_device.h | 22 +++---- library/src/level2/csrmv_device.h | 8 +-- library/src/level2/csrmv_symm_device.h | 26 ++++---- library/src/level2/csrsv_device.h | 10 +-- library/src/level2/ellmv_device.h | 4 +- library/src/level2/rocsparse_coomv.cpp | 14 ++--- .../src/level2/rocsparse_csritsv_analysis.cpp | 14 ++--- .../src/level2/rocsparse_csritsv_solve.cpp | 4 +- library/src/level3/bsrsm_device_large.h | 4 +- library/src/level3/csrmm_device.h | 20 +++--- library/src/level3/csrsm_device.h | 2 +- .../rocsparse_coomm_template_atomic.cpp | 26 ++++---- ...sparse_coomm_template_segmented_atomic.cpp | 24 ++++--- library/src/precond/bsric0_device.h | 32 +++++----- library/src/precond/bsrilu0_device.h | 10 +-- library/src/precond/csric0_device.h | 8 +-- library/src/precond/csrilu0_device.h | 8 +-- library/src/precond/itilu0/common.cpp | 10 +-- .../rocsparse_csritilu0_async_inplace.cpp | 8 +-- .../itilu0/rocsparse_csritilu0x_async.cpp | 4 +- .../itilu0/rocsparse_csritilu0x_sync.cpp | 2 +- .../rocsparse_csritilu0x_sync_fusion.cpp | 6 +- 28 files changed, 229 insertions(+), 171 deletions(-) diff --git a/library/src/extra/bsrgemm_device.h b/library/src/extra/bsrgemm_device.h index 86cdd8f2..85b44d80 100644 --- a/library/src/extra/bsrgemm_device.h +++ b/library/src/extra/bsrgemm_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -188,16 +188,16 @@ static __device__ __forceinline__ void insert_pair_rxc( if(table[hash] == key) { // Element already present, add value to exsiting entry - atomicAdd(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val); + rocsparse_atomic_add(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val); break; } else if(table[hash] == empty) { // If empty, add element with atomic - if(atomicCAS(&table[hash], empty, key) == empty) + if(rocsparse_atomic_cas(&table[hash], empty, key) == empty) { // Add value - atomicAdd(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val); + rocsparse_atomic_add(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val); break; } } @@ -1320,9 +1320,10 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio } } - atomicAdd(&data[BLOCKDIM * BLOCKDIM * (col_B - chunk_begin) - + BLOCKDIM * r + c], - alpha * val_AB); + rocsparse_atomic_add( + &data[BLOCKDIM * BLOCKDIM * (col_B - chunk_begin) + BLOCKDIM * r + + c], + alpha * val_AB); } } else if(col_B >= chunk_end) @@ -1373,7 +1374,7 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio val_D = beta * bsr_val_D[block_dim * block_dim * j + block_dim * c + r]; } - atomicAdd( + rocsparse_atomic_add( &data[BLOCKDIM * BLOCKDIM * (col_D - chunk_begin) + BLOCKDIM * r + c], val_D); } @@ -1391,7 +1392,7 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio { // Atomically determine the new chunks beginning (minimum column index of B // that is larger than the current chunks end point) - atomicMin(&next_chunk, min_col); + rocsparse_atomic_min(&next_chunk, min_col); } // Wait for all threads to finish diff --git a/library/src/extra/csrgemm_device.h b/library/src/extra/csrgemm_device.h index d987d82c..3fb810bb 100644 --- a/library/src/extra/csrgemm_device.h +++ b/library/src/extra/csrgemm_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -346,7 +346,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table) else if(table[hash] == -1) { // If empty, add element with atomic - if(atomicCAS(&table[hash], -1, key) == -1) + if(rocsparse_atomic_cas(&table[hash], -1, key) == -1) { // Increment number of insertions return true; @@ -376,16 +376,16 @@ static __device__ __forceinline__ void if(table[hash] == key) { // Element already present, add value to exsiting entry - atomicAdd(&data[hash], val); + rocsparse_atomic_add(&data[hash], val); break; } else if(table[hash] == empty) { // If empty, add element with atomic - if(atomicCAS(&table[hash], empty, key) == empty) + if(rocsparse_atomic_cas(&table[hash], empty, key) == empty) { // Add value - atomicAdd(&data[hash], val); + rocsparse_atomic_add(&data[hash], val); break; } } @@ -792,7 +792,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL { // Atomically determine the new chunks beginning (minimum column index of B // that is larger than the current chunks end point) - atomicMin(&next_chunk, min_col); + rocsparse_atomic_min(&next_chunk, min_col); } // Wait for all threads to finish row nnz operation @@ -812,7 +812,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == WFSIZE - 1) { // Atomically add this chunks nnz to the total row nnz - atomicAdd(&nnz, chunk_nnz); + rocsparse_atomic_add(&nnz, chunk_nnz); } // Wait for atomics to be processed @@ -1316,7 +1316,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n, table[col_B - chunk_begin] = 1; // Atomically accumulate the intermediate products - atomicAdd(&data[col_B - chunk_begin], val_A * csr_val_B[k]); + rocsparse_atomic_add(&data[col_B - chunk_begin], val_A * csr_val_B[k]); } else if(col_B >= chunk_end) { @@ -1360,7 +1360,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n, table[col_D - chunk_begin] = 1; // Atomically accumulate the entry of D - atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]); + rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]); } else if(col_D >= chunk_end) { @@ -1382,7 +1382,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n, { // Atomically determine the new chunks beginning (minimum column index of B // that is larger than the current chunks end point) - atomicMin(&next_chunk, min_col); + rocsparse_atomic_min(&next_chunk, min_col); } // Wait for all threads to finish diff --git a/library/src/extra/rocsparse_csrgemm_numeric.cpp b/library/src/extra/rocsparse_csrgemm_numeric.cpp index e041833a..b1eb0a07 100644 --- a/library/src/extra/rocsparse_csrgemm_numeric.cpp +++ b/library/src/extra/rocsparse_csrgemm_numeric.cpp @@ -82,7 +82,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table) else if(table[hash] == -1) { // If empty, add element with atomic - if(atomicCAS(&table[hash], -1, key) == -1) + if(rocsparse_atomic_cas(&table[hash], -1, key) == -1) { // Increment number of insertions return true; @@ -116,8 +116,8 @@ static __device__ __forceinline__ bool } else if(table[hash] == -1) { - atomicCAS(&table[hash], -1, key); - atomicCAS(&local_idxs[hash], -1, local_idx); + rocsparse_atomic_cas(&table[hash], -1, key); + rocsparse_atomic_cas(&local_idxs[hash], -1, local_idx); return true; } else @@ -142,16 +142,16 @@ static __device__ __forceinline__ void if(table[hash] == key) { // Element already present, add value to exsiting entry - atomicAdd(&data[hash], val); + rocsparse_atomic_add(&data[hash], val); break; } else if(table[hash] == empty) { // If empty, add element with atomic - if(atomicCAS(&table[hash], empty, key) == empty) + if(rocsparse_atomic_cas(&table[hash], empty, key) == empty) { // Add value - atomicAdd(&data[hash], val); + rocsparse_atomic_add(&data[hash], val); break; } } @@ -647,7 +647,7 @@ __device__ void table[col_B - chunk_begin] = 1; // Atomically accumulate the intermediate products - atomicAdd(&data[col_B - chunk_begin], val_A * csr_val_B[k]); + rocsparse_atomic_add(&data[col_B - chunk_begin], val_A * csr_val_B[k]); } else if(col_B >= chunk_end) { @@ -691,7 +691,7 @@ __device__ void table[col_D - chunk_begin] = 1; // Atomically accumulate the entry of D - atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]); + rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]); } else if(col_D >= chunk_end) { @@ -713,7 +713,7 @@ __device__ void { // Atomically determine the new chunks beginning (minimum column index of B // that is larger than the current chunks end point) - atomicMin(&next_chunk, min_col); + rocsparse_atomic_min(&next_chunk, min_col); } // Wait for all threads to finish diff --git a/library/src/extra/rocsparse_csrgemm_symbolic.cpp b/library/src/extra/rocsparse_csrgemm_symbolic.cpp index 111bbb83..00c4eedd 100644 --- a/library/src/extra/rocsparse_csrgemm_symbolic.cpp +++ b/library/src/extra/rocsparse_csrgemm_symbolic.cpp @@ -198,7 +198,7 @@ __device__ void table[col_D - chunk_begin] = 1; // Atomically accumulate the entry of D - // atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]); + // rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]); } else if(col_D >= chunk_end) { @@ -220,7 +220,7 @@ __device__ void { // Atomically determine the new chunks beginning (minimum column index of B // that is larger than the current chunks end point) - atomicMin(&next_chunk, min_col); + rocsparse_atomic_min(&next_chunk, min_col); } // Wait for all threads to finish @@ -574,7 +574,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table, else if(table[hash] == empty) { // If empty, add element with atomic - if(atomicCAS(&table[hash], empty, key) == empty) + if(rocsparse_atomic_cas(&table[hash], empty, key) == empty) { // Increment number of insertions return true; diff --git a/library/src/include/common.h b/library/src/include/common.h index 74c4a3a2..b765cc90 100644 --- a/library/src/include/common.h +++ b/library/src/include/common.h @@ -168,22 +168,71 @@ __device__ __forceinline__ double rocsparse_shfl(double var, int src_lane, int w __device__ __forceinline__ rocsparse_float_complex rocsparse_shfl(rocsparse_float_complex var, int src_lane, int width = warpSize) { return rocsparse_float_complex(__shfl(std::real(var), src_lane, width), __shfl(std::imag(var), src_lane, width)); } __device__ __forceinline__ rocsparse_double_complex rocsparse_shfl(rocsparse_double_complex var, int src_lane, int width = warpSize) { return rocsparse_double_complex(__shfl(std::real(var), src_lane, width), __shfl(std::imag(var), src_lane, width)); } -__device__ __forceinline__ int64_t atomicMin(int64_t* ptr, int64_t val) { return atomicMin((unsigned long long*)ptr, val); } -__device__ __forceinline__ int64_t atomicMax(int64_t* ptr, int64_t val) { return atomicMax((unsigned long long*)ptr, val); } -__device__ __forceinline__ int64_t atomicAdd(int64_t* ptr, int64_t val) { return atomicAdd((unsigned long long*)ptr, val); } -__device__ __forceinline__ int64_t atomicCAS(int64_t* ptr, int64_t cmp, int64_t val) { return atomicCAS((unsigned long long*)ptr, cmp, val); } +template +__device__ __forceinline__ T rocsparse_atomic_min(T * ptr, T val) +{ + return atomicMin(ptr,val); +} + +template +__device__ __forceinline__ T rocsparse_atomic_max(T * ptr, T val) +{ + return atomicMax(ptr,val); +} + +template +__device__ __forceinline__ T rocsparse_atomic_add(T * ptr, T val) +{ + return atomicAdd(ptr,val); +} -__device__ __forceinline__ rocsparse_float_complex atomicAdd(rocsparse_float_complex* ptr, rocsparse_float_complex val) +template +__device__ __forceinline__ T rocsparse_atomic_cas(T * ptr, T cmp, T val) +{ + return atomicCAS(ptr, cmp, val); +} + + + +template <> +__device__ __forceinline__ int64_t rocsparse_atomic_min(int64_t * ptr, int64_t val) +{ + return atomicMin((unsigned long long*)ptr, (unsigned long long)val); +} + +template <> +__device__ __forceinline__ int64_t rocsparse_atomic_max(int64_t * ptr, int64_t val) +{ + return atomicMax((unsigned long long*)ptr, val); +} + + +template <> +__device__ __forceinline__ int64_t rocsparse_atomic_add(int64_t * ptr, int64_t val) +{ + return atomicAdd((unsigned long long*)ptr, val); +} + +template <> +__device__ __forceinline__ rocsparse_float_complex rocsparse_atomic_add(rocsparse_float_complex* ptr, rocsparse_float_complex val) { return rocsparse_float_complex(atomicAdd((float*)ptr, std::real(val)), atomicAdd((float*)ptr + 1, std::imag(val))); } -__device__ __forceinline__ rocsparse_double_complex atomicAdd(rocsparse_double_complex* ptr, rocsparse_double_complex val) + +template <> +__device__ __forceinline__ rocsparse_double_complex rocsparse_atomic_add(rocsparse_double_complex* ptr, rocsparse_double_complex val) { return rocsparse_double_complex(atomicAdd((double*)ptr, std::real(val)), atomicAdd((double*)ptr + 1, std::imag(val))); } +template <> +__device__ __forceinline__ int64_t rocsparse_atomic_cas(int64_t* ptr, int64_t cmp, int64_t val) +{ + return atomicCAS((unsigned long long*)ptr, cmp, val); +} + __device__ __forceinline__ bool rocsparse_is_inf(float val){ return (val == std::numeric_limits::infinity()); } __device__ __forceinline__ bool rocsparse_is_inf(double val){ return (val == std::numeric_limits::infinity()); } __device__ __forceinline__ bool rocsparse_is_inf(rocsparse_float_complex val){ return (std::real(val) == std::numeric_limits::infinity() || std::imag(val) == std::numeric_limits::infinity()); } @@ -955,7 +1004,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tid == 0) { - atomicMax(max_nnz, shared[0]); + rocsparse_atomic_max(max_nnz, shared[0]); } } diff --git a/library/src/level2/bsrsv_device.h b/library/src/level2/bsrsv_device.h index c564c63f..8863cf7a 100644 --- a/library/src/level2/bsrsv_device.h +++ b/library/src/level2/bsrsv_device.h @@ -171,7 +171,7 @@ static ROCSPARSE_DEVICE_ILF void if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -321,7 +321,7 @@ static ROCSPARSE_DEVICE_ILF void if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -503,7 +503,7 @@ static ROCSPARSE_DEVICE_ILF void // Find the minimum pivot, if applicable if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -685,7 +685,7 @@ static ROCSPARSE_DEVICE_ILF void // Find the minimum pivot, if applicable if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 5af35184..007ade2f 100644 --- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -466,7 +466,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_atomic_loops_device(int64_t nnz, { if(row != shared_row[tid + 1] && row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } @@ -503,7 +503,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_atomic_loops_device(int64_t nnz, } else if(prevrow >= 0) { - atomicAdd(&y[prevrow], alpha * shared_val[BLOCKSIZE - 1]); + rocsparse_atomic_add(&y[prevrow], alpha * shared_val[BLOCKSIZE - 1]); } } @@ -531,7 +531,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_atomic_loops_device(int64_t nnz, { if(row != shared_row[tid + 1] && row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } } @@ -541,7 +541,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_atomic_loops_device(int64_t nnz, { if(row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } } @@ -609,7 +609,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_aos_atomic_loops_device(int64_t nnz, { if(row != shared_row[tid + 1] && row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } @@ -644,7 +644,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_aos_atomic_loops_device(int64_t nnz, } else if(prevrow >= 0) { - atomicAdd(&y[prevrow], alpha * shared_val[BLOCKSIZE - 1]); + rocsparse_atomic_add(&y[prevrow], alpha * shared_val[BLOCKSIZE - 1]); } } @@ -672,7 +672,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_aos_atomic_loops_device(int64_t nnz, { if(row != shared_row[tid + 1] && row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } } @@ -681,7 +681,7 @@ static ROCSPARSE_DEVICE_ILF void coomvn_aos_atomic_loops_device(int64_t nnz, { if(row >= 0) { - atomicAdd(&y[row], alpha * val); + rocsparse_atomic_add(&y[row], alpha * val); } } } @@ -709,7 +709,7 @@ static ROCSPARSE_DEVICE_ILF void coomvt_device(rocsparse_operation trans, A val = (trans == rocsparse_operation_conjugate_transpose) ? rocsparse_conj(coo_val[gid]) : coo_val[gid]; - atomicAdd(&y[col], alpha * val * x[row]); + rocsparse_atomic_add(&y[col], alpha * val * x[row]); } template @@ -734,5 +734,5 @@ static ROCSPARSE_DEVICE_ILF void coomvt_aos_device(rocsparse_operation trans, A val = (trans == rocsparse_operation_conjugate_transpose) ? rocsparse_conj(coo_val[gid]) : coo_val[gid]; - atomicAdd(&y[col], alpha * val * x[row]); + rocsparse_atomic_add(&y[col], alpha * val * x[row]); } diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index c42d6b00..8666ec50 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -134,7 +134,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvt_general_device(bool conj J col = csr_col_ind[j] - idx_base; A val = conj_val(csr_val[j], conj); - atomicAdd(&y[col], row_val * val); + rocsparse_atomic_add(&y[col], row_val * val); } } } @@ -444,7 +444,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool con // The first workgroup will eventually flip this flag, and you can move forward. __syncthreads(); while(gid != first_wg_in_row && lid == 0 - && ((atomicMax(&wg_flags[first_wg_in_row], 0U)) == compare_value)) + && ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value)) ; __syncthreads(); @@ -473,7 +473,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool con if(lid == 0) { - atomicAdd(y + row, partialSums[0]); + rocsparse_atomic_add(y + row, partialSums[0]); } } } diff --git a/library/src/level2/csrmv_symm_device.h b/library/src/level2/csrmv_symm_device.h index 2d759a13..662d4d43 100644 --- a/library/src/level2/csrmv_symm_device.h +++ b/library/src/level2/csrmv_symm_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2021-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2021-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -122,7 +122,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvt_symm_general_device(bool if(col != row) { A val = conj_val(csr_val[j], conj); - atomicAdd(&y[col], row_val * val); + rocsparse_atomic_add(&y[col], row_val * val); } } } @@ -321,10 +321,10 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool if((myCol != myRow) && (col + i) < (csr_row_ptr[stop_row] - idx_base)) { if(myCol >= (stop_cols_idx) && myCol < stop_row) - atomicAdd(&cols_in_rows[myCol - (stop_cols_idx)], - (partial_sums[lid + i] * x[myRow])); + rocsparse_atomic_add(&cols_in_rows[myCol - (stop_cols_idx)], + (partial_sums[lid + i] * x[myRow])); else - atomicAdd(&y[myCol], (partial_sums[lid + i] * x[myRow])); + rocsparse_atomic_add(&y[myCol], (partial_sums[lid + i] * x[myRow])); } // For the lower triangular, the matrix value is already in partial_sums. @@ -348,10 +348,10 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool if((myCol != myRow) && (col + i) < (csr_row_ptr[stop_row] - idx_base)) { if(myCol >= (stop_cols_idx) && myCol < stop_row) - atomicAdd(&cols_in_rows[myCol - (stop_cols_idx)], - (partial_sums[lid + i] * x[myRow])); + rocsparse_atomic_add(&cols_in_rows[myCol - (stop_cols_idx)], + (partial_sums[lid + i] * x[myRow])); else - atomicAdd(&y[myCol], (partial_sums[lid + i] * x[myRow])); + rocsparse_atomic_add(&y[myCol], (partial_sums[lid + i] * x[myRow])); } // For the lower triangular, the matrix value is already in partial_sums. @@ -367,7 +367,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool for(I l = lid; l < (end_cols_idx - (stop_row - row)); l += WG_SIZE) { - atomicAdd(&y[stop_cols_idx + l], cols_in_rows[l]); + rocsparse_atomic_add(&y[stop_cols_idx + l], cols_in_rows[l]); } __syncthreads(); @@ -424,7 +424,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool temp += cols_in_rows[lid + (end_cols_idx - (stop_row - row))]; // sum from upper triangular matrix - atomicAdd(&y[row + lid], temp); + rocsparse_atomic_add(&y[row + lid], temp); } } else @@ -449,7 +449,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool // put that into the output for each row. temp += cols_in_rows[end_cols_idx - stop_row + local_row]; // sum from upper triangular matrix - atomicAdd(&y[local_row], temp); + rocsparse_atomic_add(&y[local_row], temp); local_row += hipBlockDim_x; } } @@ -525,7 +525,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool // Write result if(t == 0) { - atomicAdd(&y[myRow], alpha * partial_sums[0]); + rocsparse_atomic_add(&y[myRow], alpha * partial_sums[0]); } myRow++; } @@ -539,7 +539,7 @@ static ROCSPARSE_DEVICE_ILF void csrmvn_symm_adaptive_device(bool J myCol = csr_col_ind[j] - idx_base; if(myCol != myRow2) { - atomicAdd(&y[myCol], (alpha * conj_val(csr_val[j], conj) * x[myRow2])); + rocsparse_atomic_add(&y[myCol], (alpha * conj_val(csr_val[j], conj) * x[myRow2])); } } } diff --git a/library/src/level2/csrsv_device.h b/library/src/level2/csrsv_device.h index e101244c..5b1a2cee 100644 --- a/library/src/level2/csrsv_device.h +++ b/library/src/level2/csrsv_device.h @@ -172,12 +172,12 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL atomicOr(&done_array[row], local_max + 1); // Obtain maximum nnz - atomicMax(max_nnz, row_end - row_begin); + rocsparse_atomic_max(max_nnz, row_end - row_begin); if(csr_diag_ind[row] == -1 && diag_type == rocsparse_diag_type_non_unit) { // We are looking for the first zero pivot - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -326,12 +326,12 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL atomicOr(&done_array[row], local_max + 1); // Obtain maximum nnz - atomicMax(max_nnz, row_end - row_begin); + rocsparse_atomic_max(max_nnz, row_end - row_begin); if(csr_diag_ind[row] == -1 && diag_type == rocsparse_diag_type_non_unit) { // We are looking for the first zero pivot - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -402,7 +402,7 @@ __device__ void csrsv_device(J m, { // Numerical zero pivot found, avoid division by 0 // and store index for later use. - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); local_val = static_cast(1); } diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index d57a92bc..cbc0b77f 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -124,7 +124,7 @@ static __device__ void ellmvt_device(rocsparse_operation trans, val = rocsparse_conj(val); } - atomicAdd(&y[col], row_val * val); + rocsparse_atomic_add(&y[col], row_val * val); } else { diff --git a/library/src/level2/rocsparse_coomv.cpp b/library/src/level2/rocsparse_coomv.cpp index 6ec872d9..9f505312 100644 --- a/library/src/level2/rocsparse_coomv.cpp +++ b/library/src/level2/rocsparse_coomv.cpp @@ -243,7 +243,7 @@ rocsparse_status rocsparse_coomv_analysis_template(rocsparse_handle han RETURN_IF_ROCSPARSE_ERROR(rocsparse_coo2csr_template( handle, coo_row_ind, (I)nnz, m, csr_row_ptr, descr->base)); - hipLaunchKernelGGL((csr_max_nnz_per_row<256>), + hipLaunchKernelGGL((csr_max_nnz_per_row<256, I, I>), dim3((m - 1) / 256 + 1), dim3(256), 0, @@ -264,7 +264,7 @@ rocsparse_status rocsparse_coomv_analysis_template(rocsparse_handle han } else { - I* max_nnz = nullptr; + int64_t* max_nnz = nullptr; int64_t* csr_row_ptr = nullptr; RETURN_IF_HIP_ERROR( rocsparse_hipMallocAsync((void**)&max_nnz, sizeof(I), handle->stream)); @@ -276,7 +276,7 @@ rocsparse_status rocsparse_coomv_analysis_template(rocsparse_handle han RETURN_IF_ROCSPARSE_ERROR( rocsparse_coo2csr_template(handle, coo_row_ind, nnz, m, csr_row_ptr, descr->base)); - hipLaunchKernelGGL((csr_max_nnz_per_row<256>), + hipLaunchKernelGGL((csr_max_nnz_per_row<256, int64_t, int64_t>), dim3((m - 1) / 256 + 1), dim3(256), 0, @@ -285,11 +285,9 @@ rocsparse_status rocsparse_coomv_analysis_template(rocsparse_handle han csr_row_ptr, max_nnz); - RETURN_IF_HIP_ERROR(hipMemcpyAsync(&descr->max_nnz_per_row, - max_nnz, - sizeof(I), - hipMemcpyDeviceToHost, - handle->stream)); + int64_t local_max_nnz; + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + &local_max_nnz, max_nnz, sizeof(int64_t), hipMemcpyDeviceToHost, handle->stream)); RETURN_IF_HIP_ERROR(hipStreamSynchronize(handle->stream)); RETURN_IF_HIP_ERROR(rocsparse_hipFreeAsync(max_nnz, handle->stream)); diff --git a/library/src/level2/rocsparse_csritsv_analysis.cpp b/library/src/level2/rocsparse_csritsv_analysis.cpp index 3ae5c33f..83541c6b 100644 --- a/library/src/level2/rocsparse_csritsv_analysis.cpp +++ b/library/src/level2/rocsparse_csritsv_analysis.cpp @@ -43,9 +43,9 @@ __launch_bounds__(BLOCKSIZE) static __global__ const J c = (((ind_[ptr_diag_[tid] - base_ + ptr_shift_] - base_) != tid) ? 1 : 0); if(c > 0) { - const J p = (tid + base_); - atomicMin(position, p); - atomicAdd(count, c); + const rocsparse_int p = (tid + base_); + rocsparse_atomic_min(position, p); + rocsparse_atomic_add(count, c); } } } @@ -66,9 +66,9 @@ __launch_bounds__(BLOCKSIZE) static __global__ const J c = (((ind_[ptr_[tid + shift] - shift - base_] - base_) != tid) ? 1 : 0); if(c > 0) { - const J p = (tid + base_); - atomicMin(position, p); - atomicAdd(count, c); + const rocsparse_int p = (tid + base_); + rocsparse_atomic_min(position, p); + rocsparse_atomic_add(count, c); } } } @@ -88,7 +88,7 @@ __launch_bounds__(BLOCKSIZE) static __global__ const J c = (((ind_[ptr_[tid + shift] - shift - base_] - base_) == tid) ? 1 : 0); if(c > 0) { - atomicAdd(count, c); + rocsparse_atomic_add(count, c); } } } diff --git a/library/src/level2/rocsparse_csritsv_solve.cpp b/library/src/level2/rocsparse_csritsv_solve.cpp index 4e97a3af..be54b69f 100644 --- a/library/src/level2/rocsparse_csritsv_solve.cpp +++ b/library/src/level2/rocsparse_csritsv_solve.cpp @@ -76,13 +76,13 @@ struct calculator_inverse_diagonal_t } else { - atomicMin(zero_pivot, tid + base); + rocsparse_atomic_min(zero_pivot, tid + base); invdiag[tid] = static_cast(1); } } else { - atomicMin(zero_pivot, tid + base); + rocsparse_atomic_min(zero_pivot, tid + base); invdiag[tid] = static_cast(1); } } diff --git a/library/src/level3/bsrsm_device_large.h b/library/src/level3/bsrsm_device_large.h index d6c71742..81f59721 100644 --- a/library/src/level3/bsrsm_device_large.h +++ b/library/src/level3/bsrsm_device_large.h @@ -183,7 +183,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -345,7 +345,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(pivot == true) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 9188a861..bd6e857d 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -485,16 +485,18 @@ static ROCSPARSE_DEVICE_ILF void csrmmtn_general_device(bool conj_A, { for(J i = 0; i < WF_SIZE && (i + hipBlockIdx_y * WF_SIZE) < N; ++i) { - atomicAdd(&C[col + (i + hipBlockIdx_y * WF_SIZE) * ldc + batch_stride_C * batch], - val * shared_B[wid][i]); + rocsparse_atomic_add( + &C[col + (i + hipBlockIdx_y * WF_SIZE) * ldc + batch_stride_C * batch], + val * shared_B[wid][i]); } } else { for(J i = 0; i < WF_SIZE && (i + hipBlockIdx_y * WF_SIZE) < N; ++i) { - atomicAdd(&C[col * ldc + (i + hipBlockIdx_y * WF_SIZE) + batch_stride_C * batch], - val * shared_B[wid][i]); + rocsparse_atomic_add( + &C[col * ldc + (i + hipBlockIdx_y * WF_SIZE) + batch_stride_C * batch], + val * shared_B[wid][i]); } } } @@ -559,16 +561,18 @@ static ROCSPARSE_DEVICE_ILF void csrmmtt_general_device(bool conj_A, { for(J i = 0; i < WF_SIZE && (i + hipBlockIdx_y * WF_SIZE) < N; ++i) { - atomicAdd(&C[col + (i + hipBlockIdx_y * WF_SIZE) * ldc + batch_stride_C * batch], - val * shared_B[wid][i]); + rocsparse_atomic_add( + &C[col + (i + hipBlockIdx_y * WF_SIZE) * ldc + batch_stride_C * batch], + val * shared_B[wid][i]); } } else { for(J i = 0; i < WF_SIZE && (i + hipBlockIdx_y * WF_SIZE) < N; ++i) { - atomicAdd(&C[col * ldc + (i + hipBlockIdx_y * WF_SIZE) + batch_stride_C * batch], - val * shared_B[wid][i]); + rocsparse_atomic_add( + &C[col * ldc + (i + hipBlockIdx_y * WF_SIZE) + batch_stride_C * batch], + val * shared_B[wid][i]); } } } diff --git a/library/src/level3/csrsm_device.h b/library/src/level3/csrsm_device.h index 97da841f..1b035d6a 100644 --- a/library/src/level3/csrsm_device.h +++ b/library/src/level3/csrsm_device.h @@ -117,7 +117,7 @@ __device__ void csrsm_device(rocsparse_operation transB, // and store index for later use. if(hipThreadIdx_x == 0) { - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } local_val = static_cast(1); diff --git a/library/src/level3/rocsparse_coomm_template_atomic.cpp b/library/src/level3/rocsparse_coomm_template_atomic.cpp index 9542652b..767a2ee8 100644 --- a/library/src/level3/rocsparse_coomm_template_atomic.cpp +++ b/library/src/level3/rocsparse_coomm_template_atomic.cpp @@ -76,14 +76,16 @@ static ROCSPARSE_DEVICE_ILF void coommnn_atomic_main_device(bool conj_A, { for(I p = 0; p < LOOPS; p++) { - atomicAdd(&C[(colB + p * WF_SIZE) * ldc + current_row], alpha * sum[p]); + rocsparse_atomic_add(&C[(colB + p * WF_SIZE) * ldc + current_row], + alpha * sum[p]); } } else { for(I p = 0; p < LOOPS; p++) { - atomicAdd(&C[current_row * ldc + colB + p * WF_SIZE], alpha * sum[p]); + rocsparse_atomic_add(&C[current_row * ldc + colB + p * WF_SIZE], + alpha * sum[p]); } } @@ -117,14 +119,14 @@ static ROCSPARSE_DEVICE_ILF void coommnn_atomic_main_device(bool conj_A, { for(I p = 0; p < LOOPS; p++) { - atomicAdd(&C[(colB + p * WF_SIZE) * ldc + current_row], alpha * sum[p]); + rocsparse_atomic_add(&C[(colB + p * WF_SIZE) * ldc + current_row], alpha * sum[p]); } } else { for(I p = 0; p < LOOPS; p++) { - atomicAdd(&C[current_row * ldc + colB + p * WF_SIZE], alpha * sum[p]); + rocsparse_atomic_add(&C[current_row * ldc + colB + p * WF_SIZE], alpha * sum[p]); } } } @@ -178,11 +180,11 @@ static ROCSPARSE_DEVICE_ILF void coommnn_atomic_remainder_device(bool conj_A, { if(order == rocsparse_order_column) { - atomicAdd(&C[colB * ldc + current_row], sum); + rocsparse_atomic_add(&C[colB * ldc + current_row], sum); } else { - atomicAdd(&C[current_row * ldc + colB], sum); + rocsparse_atomic_add(&C[current_row * ldc + colB], sum); } } @@ -238,11 +240,11 @@ static ROCSPARSE_DEVICE_ILF void coommnn_atomic_remainder_device(bool conj_A, { if(order == rocsparse_order_column) { - atomicAdd(&C[(l + swid) * ldc + current_row], sum); + rocsparse_atomic_add(&C[(l + swid) * ldc + current_row], sum); } else { - atomicAdd(&C[current_row * ldc + (l + swid)], sum); + rocsparse_atomic_add(&C[current_row * ldc + (l + swid)], sum); } } } @@ -256,11 +258,11 @@ static ROCSPARSE_DEVICE_ILF void coommnn_atomic_remainder_device(bool conj_A, { if(order == rocsparse_order_column) { - atomicAdd(&C[(l + swid) * ldc + current_row], sum); + rocsparse_atomic_add(&C[(l + swid) * ldc + current_row], sum); } else { - atomicAdd(&C[current_row * ldc + (l + swid)], sum); + rocsparse_atomic_add(&C[current_row * ldc + (l + swid)], sum); } } } @@ -308,11 +310,11 @@ static __device__ void coommtn_atomic_device(bool conj_A, if(order == rocsparse_order_column) { - atomicAdd(&C[hipBlockIdx_y * ldc + col], alpha * (val * bval)); + rocsparse_atomic_add(&C[hipBlockIdx_y * ldc + col], alpha * (val * bval)); } else { - atomicAdd(&C[col * ldc + hipBlockIdx_y], alpha * (val * bval)); + rocsparse_atomic_add(&C[col * ldc + hipBlockIdx_y], alpha * (val * bval)); } } diff --git a/library/src/level3/rocsparse_coomm_template_segmented_atomic.cpp b/library/src/level3/rocsparse_coomm_template_segmented_atomic.cpp index eeb48d17..ae64dc9b 100644 --- a/library/src/level3/rocsparse_coomm_template_segmented_atomic.cpp +++ b/library/src/level3/rocsparse_coomm_template_segmented_atomic.cpp @@ -146,16 +146,18 @@ static ROCSPARSE_DEVICE_ILF void coommnn_segmented_atomic_device(rocsparse_opera { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[prevrow + (col_offset + p) * ldc + batch_stride_C * batch], - shared_val[p][WF_SIZE - 1]); + rocsparse_atomic_add( + &C[prevrow + (col_offset + p) * ldc + batch_stride_C * batch], + shared_val[p][WF_SIZE - 1]); } } else { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[(col_offset + p) + prevrow * ldc + batch_stride_C * batch], - shared_val[p][WF_SIZE - 1]); + rocsparse_atomic_add( + &C[(col_offset + p) + prevrow * ldc + batch_stride_C * batch], + shared_val[p][WF_SIZE - 1]); } } } @@ -205,16 +207,16 @@ static ROCSPARSE_DEVICE_ILF void coommnn_segmented_atomic_device(rocsparse_opera { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[row + (col_offset + p) * ldc + batch_stride_C * batch], - val[p]); + rocsparse_atomic_add( + &C[row + (col_offset + p) * ldc + batch_stride_C * batch], val[p]); } } else { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[(col_offset + p) + row * ldc + batch_stride_C * batch], - val[p]); + rocsparse_atomic_add( + &C[(col_offset + p) + row * ldc + batch_stride_C * batch], val[p]); } } } @@ -230,14 +232,16 @@ static ROCSPARSE_DEVICE_ILF void coommnn_segmented_atomic_device(rocsparse_opera { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[row + (col_offset + p) * ldc + batch_stride_C * batch], val[p]); + rocsparse_atomic_add(&C[row + (col_offset + p) * ldc + batch_stride_C * batch], + val[p]); } } else { for(I p = 0; p < COLS; p++) { - atomicAdd(&C[(col_offset + p) + row * ldc + batch_stride_C * batch], val[p]); + rocsparse_atomic_add(&C[(col_offset + p) + row * ldc + batch_stride_C * batch], + val[p]); } } } diff --git a/library/src/precond/bsric0_device.h b/library/src/precond/bsric0_device.h index fd6fb7a6..61d1c952 100644 --- a/library/src/precond/bsric0_device.h +++ b/library/src/precond/bsric0_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ -* Copyright (C) 2020-2022 Advanced Micro Devices, Inc. All rights Reserved. +* Copyright (C) 2020-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -66,7 +66,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); // Last lane in wavefront writes "we are done" flag for its block row atomicOr(&block_done[block_row], 1); @@ -301,7 +301,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_col + idx_base); + rocsparse_atomic_min(zero_pivot, block_col + idx_base); } diag_val = static_cast(1); @@ -371,7 +371,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue @@ -457,7 +457,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); // Last lane in wavefront writes "we are done" flag for its block row atomicOr(&block_done[block_row], 1); @@ -618,7 +618,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_col + idx_base); + rocsparse_atomic_min(zero_pivot, block_col + idx_base); } diag_val = static_cast(1); @@ -697,7 +697,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue @@ -791,7 +791,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); // Last lane in wavefront writes "we are done" flag for its block row atomicOr(&block_done[block_row], 1); @@ -961,7 +961,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_col + idx_base); + rocsparse_atomic_min(zero_pivot, block_col + idx_base); } diag_val = static_cast(1); @@ -1054,7 +1054,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue @@ -1151,7 +1151,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); // Last lane in wavefront writes "we are done" flag for its block row atomicOr(&block_done[block_row], 1); @@ -1303,7 +1303,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_col + idx_base); + rocsparse_atomic_min(zero_pivot, block_col + idx_base); } diag_val = static_cast(1); @@ -1405,7 +1405,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(tidx == 0 && tidy == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue @@ -1493,7 +1493,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == WFSIZE - 1) { - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); // Last lane in wavefront writes "we are done" flag for its block row atomicOr(&block_done[block_row], 1); @@ -1568,7 +1568,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_col + idx_base); + rocsparse_atomic_min(zero_pivot, block_col + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue @@ -1683,7 +1683,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, block_row + idx_base); + rocsparse_atomic_min(zero_pivot, block_row + idx_base); } // Normally would break here but to avoid divergence set diag_val to one and continue diff --git a/library/src/precond/bsrilu0_device.h b/library/src/precond/bsrilu0_device.h index 07f704a6..7cb59fe4 100644 --- a/library/src/precond/bsrilu0_device.h +++ b/library/src/precond/bsrilu0_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2020-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2020-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -316,7 +316,7 @@ __device__ void bsrilu0_2_8_device(rocsparse_direction dir, if(pivot) { // Atomically set minimum zero pivot, if found - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -643,7 +643,7 @@ __device__ void bsrilu0_9_32_device(rocsparse_direction dir, if(pivot) { // Atomically set minimum zero pivot, if found - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -950,7 +950,7 @@ __device__ void bsrilu0_33_64_device(rocsparse_direction dir, if(pivot) { // Atomically set minimum zero pivot, if found - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } @@ -1203,7 +1203,7 @@ __device__ void bsrilu0_general_device(rocsparse_direction dir, if(pivot) { // Atomically set minimum zero pivot, if found - atomicMin(zero_pivot, row + idx_base); + rocsparse_atomic_min(zero_pivot, row + idx_base); } } } diff --git a/library/src/precond/csric0_device.h b/library/src/precond/csric0_device.h index 98aa183d..f02cf98f 100644 --- a/library/src/precond/csric0_device.h +++ b/library/src/precond/csric0_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ -* Copyright (C) 2020-2022 Advanced Micro Devices, Inc. All rights Reserved. +* Copyright (C) 2020-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -95,7 +95,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL // key is already inserted, done break; } - else if(atomicCAS(&table[hash], -1, key) == -1) + else if(rocsparse_atomic_cas(&table[hash], -1, key) == -1) { // inserted key into the table, done data[hash] = j; @@ -151,7 +151,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, local_col + idx_base); + rocsparse_atomic_min(zero_pivot, local_col + idx_base); } // Skip this row if it has a zero pivot @@ -320,7 +320,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, local_col + idx_base); + rocsparse_atomic_min(zero_pivot, local_col + idx_base); } // Skip this row if it has a zero pivot diff --git a/library/src/precond/csrilu0_device.h b/library/src/precond/csrilu0_device.h index 3a4b114f..95464094 100644 --- a/library/src/precond/csrilu0_device.h +++ b/library/src/precond/csrilu0_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2022 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -95,7 +95,7 @@ __device__ void csrilu0_hash_kernel(rocsparse_int m, // key is already inserted, done break; } - else if(atomicCAS(&table[hash], -1, key) == -1) + else if(rocsparse_atomic_cas(&table[hash], -1, key) == -1) { // inserted key into the table, done data[hash] = j; @@ -162,7 +162,7 @@ __device__ void csrilu0_hash_kernel(rocsparse_int m, if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, local_col + idx_base); + rocsparse_atomic_min(zero_pivot, local_col + idx_base); } // Skip this row if it has a zero pivot @@ -320,7 +320,7 @@ __device__ void csrilu0_binsearch_kernel(rocsparse_int m_, if(lid == 0) { // We are looking for the first zero pivot - atomicMin(zero_pivot, local_col + idx_base); + rocsparse_atomic_min(zero_pivot, local_col + idx_base); } // Skip this row if it has a zero pivot diff --git a/library/src/precond/itilu0/common.cpp b/library/src/precond/itilu0/common.cpp index c08379b8..73a47824 100644 --- a/library/src/precond/itilu0/common.cpp +++ b/library/src/precond/itilu0/common.cpp @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2022 Advanced Micro Devices, Inc. + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -149,11 +149,11 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL { if(nrm0_ != nullptr) { - atomicMax(nrm_, shared[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, shared[0] / nrm0_[0]); } else { - atomicMax(nrm_, shared[0]); + rocsparse_atomic_max(nrm_, shared[0]); } } } @@ -181,11 +181,11 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL { if(nrm0_ != nullptr) { - atomicMax(nrm_, shared[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, shared[0] / nrm0_[0]); } else { - atomicMax(nrm_, shared[0]); + rocsparse_atomic_max(nrm_, shared[0]); } } } diff --git a/library/src/precond/itilu0/rocsparse_csritilu0_async_inplace.cpp b/library/src/precond/itilu0/rocsparse_csritilu0_async_inplace.cpp index e699f0ec..3be3bd32 100644 --- a/library/src/precond/itilu0/rocsparse_csritilu0_async_inplace.cpp +++ b/library/src/precond/itilu0/rocsparse_csritilu0_async_inplace.cpp @@ -249,7 +249,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL // if(hipThreadIdx_x == 0) { - atomicMax(nrm_, nrms[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, nrms[0] / nrm0_[0]); } } } @@ -385,7 +385,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL rocsparse_blockreduce_max(hipThreadIdx_x, nrms); if(hipThreadIdx_x == 0) { - atomicMax(nrm_, nrms[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, nrms[0] / nrm0_[0]); } } } @@ -1050,7 +1050,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL rocsparse_blockreduce_sum(hipThreadIdx_x, data); if(hipThreadIdx_x == 0) { - atomicAdd(nnz_, data[0]); + rocsparse_atomic_add(nnz_, data[0]); } if(nnz_diag_ != nullptr) { @@ -1059,7 +1059,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL rocsparse_blockreduce_sum(hipThreadIdx_x, data); if(hipThreadIdx_x == 0) { - atomicAdd(nnz_diag_, data[0]); + rocsparse_atomic_add(nnz_diag_, data[0]); } } } diff --git a/library/src/precond/itilu0/rocsparse_csritilu0x_async.cpp b/library/src/precond/itilu0/rocsparse_csritilu0x_async.cpp index 91fc99ae..3aaab550 100644 --- a/library/src/precond/itilu0/rocsparse_csritilu0x_async.cpp +++ b/library/src/precond/itilu0/rocsparse_csritilu0x_async.cpp @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2022 Advanced Micro Devices, Inc. + * Copyright (C) 2022-2023 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -180,7 +180,7 @@ __launch_bounds__(BLOCKSIZE) __global__ rocsparse_blockreduce_max(hipThreadIdx_x, sdata); if(hipThreadIdx_x == 0) { - atomicMax(nrm_, sdata[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, sdata[0] / nrm0_[0]); } } diff --git a/library/src/precond/itilu0/rocsparse_csritilu0x_sync.cpp b/library/src/precond/itilu0/rocsparse_csritilu0x_sync.cpp index 52a32db4..cab68223 100644 --- a/library/src/precond/itilu0/rocsparse_csritilu0x_sync.cpp +++ b/library/src/precond/itilu0/rocsparse_csritilu0x_sync.cpp @@ -134,7 +134,7 @@ __launch_bounds__(BLOCKSIZE) __global__ rocsparse_blockreduce_max(hipThreadIdx_x, sdata); if(hipThreadIdx_x == 0) { - atomicMax(nrm_, sdata[0] / nrm0_[0]); + rocsparse_atomic_max(nrm_, sdata[0] / nrm0_[0]); } } diff --git a/library/src/precond/itilu0/rocsparse_csritilu0x_sync_fusion.cpp b/library/src/precond/itilu0/rocsparse_csritilu0x_sync_fusion.cpp index 1390ee2e..0d14a59d 100644 --- a/library/src/precond/itilu0/rocsparse_csritilu0x_sync_fusion.cpp +++ b/library/src/precond/itilu0/rocsparse_csritilu0x_sync_fusion.cpp @@ -320,7 +320,7 @@ __launch_bounds__(BLOCKSIZE) __global__ iter = sdata2[0]; if(hipThreadIdx_x == 0) { - atomicMax(niter__, iter + 1); + rocsparse_atomic_max(niter__, iter + 1); } } } @@ -329,14 +329,14 @@ __launch_bounds__(BLOCKSIZE) __global__ { if(hipThreadIdx_x == 0) { - atomicMax(nrms_corr, nrminf); + rocsparse_atomic_max(nrms_corr, nrminf); } } if(compute_nrm_residual) { if(hipThreadIdx_x == 0) { - atomicMax(nrms_residual, nrminf_residual); + rocsparse_atomic_max(nrms_residual, nrminf_residual); } } }