diff --git a/CHANGELOG.md b/CHANGELOG.md index b5c8370d..7ed90190 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Documentation for rocSPARSE is available at [https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/). -## rocSPARSE 3.1.1 for ROCm 6.1.0 +## rocSPARSE 3.1.2 for ROCm 6.1.0 ### Additions @@ -15,6 +15,7 @@ Documentation for rocSPARSE is available at * Triangular solve with multiple rhs (SpSM, csrsm, ...) now calls SpSV, csrsv, etcetera when nrhs equals 1 * Improved user manual section *Installation and Building for Linux and Windows* +* Improved SpMV in CSR format on MI300 ## rocSPARSE 3.0.2 for ROCm 6.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 164abff3..b5ce2bc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ if( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) endif( ) # Setup version -set(VERSION_STRING "3.1.1") +set(VERSION_STRING "3.1.2") set(SOVERSION_STRING "1.0") rocm_setup_version(VERSION ${VERSION_STRING}) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index d39d410e..66e88e3b 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -436,23 +436,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj, // The first workgroup handles the output initialization. Y out_val = y[row]; temp_sum = (beta - static_cast(1)) * out_val; - atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups. 
+ + // All inter thread communication is done using atomics, therefore cache flushes or + // invalidates should not be needed (thus __threadfence() has been removed to regain + // performance). + // Because of atomics being relaxed, however, the compiler is allowed to reorder them + // with respect to ordinary memory accesses (and other relaxed atomic operations). + // In this case, out_val seems to be reordered with the xor and subsequently, accumulation + // ends up being wrong. + // To force the compiler to stick to the order of operations, we need acquire/release fences. + // Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2 + // invalidations. + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + + // Release other workgroups + atomicXor(&wg_flags[first_wg_in_row], 1U); } - // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. - // If your flag == first_wg's flag, you spin loop. - // The first workgroup will eventually flip this flag, and you can move forward. - __threadfence(); - while(gid != first_wg_in_row && lid == 0 - && ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value)) - ; - - // After you've passed the barrier, update your local flag to make sure that - // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0) - wg_flags[gid] ^= 1U; - // All but the final workgroup in a long-row collaboration have the same start_row - // and stop_row. They only run for one iteration. // Load in a bunch of partial results into your register space, rather than LDS (no // contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. @@ -469,8 +469,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj, // Reduce partial sums rocsparse_blockreduce_sum(lid, partialSums); + // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. 
+ // If your flag == first_wg's flag, you spin loop. + // The first workgroup will eventually flip this flag, and you can move forward. if(lid == 0) { + if(gid != first_wg_in_row) + { + while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value) + ; + + // __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + wg_flags[gid] ^= 1U; + } + rocsparse_atomic_add(y + row, partialSums[0]); } } @@ -946,24 +961,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj, // The first workgroup handles the output initialization. Y out_val = y[row]; temp_sum = (beta - static_cast(1)) * out_val; - atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups. + + // All inter thread communication is done using atomics, therefore cache flushes or + // invalidates should not be needed (thus __threadfence() has been removed to regain + // performance). + // Because of atomics being relaxed, however, the compiler is allowed to reorder them + // with respect to ordinary memory accesses (and other relaxed atomic operations). + // In this case, out_val seems to be reordered with the xor and subsequently, accumulation + // ends up being wrong. + // To force the compiler to stick to the order of operations, we need acquire/release fences. + // Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2 + // invalidations. + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + + // Release other workgroups + atomicXor(&wg_flags[first_wg_in_row], 1U); } - // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. - // If your flag == first_wg's flag, you spin loop. - // The first workgroup will eventually flip this flag, and you can move forward. 
- __threadfence(); - while(gid != first_wg_in_row && lid == 0 - && ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value)) - ; - - // After you've passed the barrier, update your local flag to make sure that - // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0) - wg_flags[gid] ^= 1U; - - // All but the final workgroup in a long-row collaboration have the same start_row - // and stop_row. They only run for one iteration. // Load in a bunch of partial results into your register space, rather than LDS (no // contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. @@ -980,8 +994,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj, // Reduce partial sums rocsparse_blockreduce_sum(lid, partialSums); + // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. + // If your flag == first_wg's flag, you spin loop. + // The first workgroup will eventually flip this flag, and you can move forward. if(lid == 0) { + if(gid != first_wg_in_row) + { + while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value) + ; + + // __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + wg_flags[gid] ^= 1U; + } + rocsparse_atomic_add((y + row), partialSums[0]); } }