Skip to content

Commit

Permalink
Cp/csrmv fix (#379)
Browse files Browse the repository at this point in the history
* fix performance issue on csrmv long rows (#742)

* updated changelog and version
  • Loading branch information
ntrost57 authored Mar 8, 2024
1 parent 4fb8556 commit 3f96fa5
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 34 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Documentation for rocSPARSE is available at
[https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/).

## rocSPARSE 3.1.1 for ROCm 6.1.0
## rocSPARSE 3.1.2 for ROCm 6.1.0

### Additions

Expand All @@ -15,6 +15,7 @@ Documentation for rocSPARSE is available at

* Triangular solve with multiple right-hand sides (SpSM, csrsm, ...) now calls SpSV, csrsv, etc. when nrhs equals 1
* Improved user manual section *Installation and Building for Linux and Windows*
* Improved SpMV in CSR format on MI300

## rocSPARSE 3.0.2 for ROCm 6.0.0

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ if( CMAKE_CXX_COMPILER_ID MATCHES "Clang" )
endif( )

# Setup version
set(VERSION_STRING "3.1.1")
set(VERSION_STRING "3.1.2")
set(SOVERSION_STRING "1.0")

rocm_setup_version(VERSION ${VERSION_STRING})
Expand Down
93 changes: 61 additions & 32 deletions library/src/level2/csrmv_device.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*! \file */
/* ************************************************************************
* Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved.
* Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -436,23 +436,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj,
// The first workgroup handles the output initialization.
Y out_val = y[row];
temp_sum = (beta - static_cast<T>(1)) * out_val;
atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups.

// All inter thread communication is done using atomics, therefore cache flushes or
// invalidates should not be needed (thus __threadfence() has been removed to regain
// performance).
// Because of atomics being relaxed, however, the compiler is allowed to reorder them
// with respect to ordinary memory accesses (and other relaxed atomic operations).
// In this case, out_val seems to be reordered with the xor and, subsequently, the
// accumulation ends up being wrong.
// To force the compiler to stick to the order of operations, we need acquire/release fences.
// Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2
// invalidations.
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");

// Release other workgroups
atomicXor(&wg_flags[first_wg_in_row], 1U);
}
// For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on.
// If your flag == first_wg's flag, you spin loop.
// The first workgroup will eventually flip this flag, and you can move forward.
__threadfence();
while(gid != first_wg_in_row && lid == 0
&& ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value))
;

// After you've passed the barrier, update your local flag to make sure that
// the next time through, you know what to wait on.
if(gid != first_wg_in_row && lid == 0)
wg_flags[gid] ^= 1U;

// All but the final workgroup in a long-row collaboration have the same start_row
// and stop_row. They only run for one iteration.
// Load in a bunch of partial results into your register space, rather than LDS (no
// contention)
// Then dump the partially reduced answers into the LDS for inter-work-item reduction.
Expand All @@ -469,8 +469,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj,
// Reduce partial sums
rocsparse_blockreduce_sum<WG_SIZE>(lid, partialSums);

// For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on.
// If your flag == first_wg's flag, you spin loop.
// The first workgroup will eventually flip this flag, and you can move forward.
if(lid == 0)
{
if(gid != first_wg_in_row)
{
while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value)
;

// __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");

// After you've passed the barrier, update your local flag to make sure that
// the next time through, you know what to wait on.
wg_flags[gid] ^= 1U;
}

rocsparse_atomic_add(y + row, partialSums[0]);
}
}
Expand Down Expand Up @@ -946,24 +961,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj,
// The first workgroup handles the output initialization.
Y out_val = y[row];
temp_sum = (beta - static_cast<T>(1)) * out_val;
atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups.

// All inter thread communication is done using atomics, therefore cache flushes or
// invalidates should not be needed (thus __threadfence() has been removed to regain
// performance).
// Because of atomics being relaxed, however, the compiler is allowed to reorder them
// with respect to ordinary memory accesses (and other relaxed atomic operations).
// In this case, out_val seems to be reordered with the xor and, subsequently, the
// accumulation ends up being wrong.
// To force the compiler to stick to the order of operations, we need acquire/release fences.
// Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2
// invalidations.
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");

// Release other workgroups
atomicXor(&wg_flags[first_wg_in_row], 1U);
}

// For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on.
// If your flag == first_wg's flag, you spin loop.
// The first workgroup will eventually flip this flag, and you can move forward.
__threadfence();
while(gid != first_wg_in_row && lid == 0
&& ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value))
;

// After you've passed the barrier, update your local flag to make sure that
// the next time through, you know what to wait on.
if(gid != first_wg_in_row && lid == 0)
wg_flags[gid] ^= 1U;

// All but the final workgroup in a long-row collaboration have the same start_row
// and stop_row. They only run for one iteration.
// Load in a bunch of partial results into your register space, rather than LDS (no
// contention)
// Then dump the partially reduced answers into the LDS for inter-work-item reduction.
Expand All @@ -980,8 +994,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj,
// Reduce partial sums
rocsparse_blockreduce_sum<BLOCKSIZE>(lid, partialSums);

// For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on.
// If your flag == first_wg's flag, you spin loop.
// The first workgroup will eventually flip this flag, and you can move forward.
if(lid == 0)
{
if(gid != first_wg_in_row)
{
while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value)
;

// __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");

// After you've passed the barrier, update your local flag to make sure that
// the next time through, you know what to wait on.
wg_flags[gid] ^= 1U;
}

rocsparse_atomic_add((y + row), partialSums[0]);
}
}

0 comments on commit 3f96fa5

Please sign in to comment.