diff --git a/CHANGELOG.md b/CHANGELOG.md index b5c8370d..7ed90190 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ Documentation for rocSPARSE is available at [https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/](https://rocm.docs.amd.com/projects/rocSPARSE/en/latest/). -## rocSPARSE 3.1.1 for ROCm 6.1.0 +## rocSPARSE 3.1.2 for ROCm 6.1.0 ### Additions @@ -15,6 +15,7 @@ Documentation for rocSPARSE is available at * Triangular solve with multiple rhs (SpSM, csrsm, ...) now calls SpSV, csrsv, etcetera when nrhs equals 1 * Improved user manual section *Installation and Building for Linux and Windows* +* Improved SpMV in CSR format on MI300 ## rocSPARSE 3.0.2 for ROCm 6.0.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 164abff3..b5ce2bc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ if( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) endif( ) # Setup version -set(VERSION_STRING "3.1.1") +set(VERSION_STRING "3.1.2") set(SOVERSION_STRING "1.0") rocm_setup_version(VERSION ${VERSION_STRING}) diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index d39d410e..66e88e3b 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -1,6 +1,6 @@ /*! \file */ /* ************************************************************************ - * Copyright (C) 2018-2023 Advanced Micro Devices, Inc. All rights Reserved. + * Copyright (C) 2018-2024 Advanced Micro Devices, Inc. All rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -436,23 +436,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj, // The first workgroup handles the output initialization. Y out_val = y[row]; temp_sum = (beta - static_cast(1)) * out_val; - atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups. 
+ + // All inter thread communication is done using atomics, therefore cache flushes or + // invalidates should not be needed (thus __threadfence() has been removed to regain + // performance). + // Because of atomics being relaxed, however, the compiler is allowed to reorder them + // with respect to ordinary memory accesses (and other relaxed atomic operations). + // In this case, out_val seems to be reordered with the xor and subsequently, accumulation + // ends up being wrong. + // To force the compiler to stick to the order of operations, we need acquire/release fences. + // Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2 + // invalidations. + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + + // Release other workgroups + atomicXor(&wg_flags[first_wg_in_row], 1U); } - // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. - // If your flag == first_wg's flag, you spin loop. - // The first workgroup will eventually flip this flag, and you can move forward. - __threadfence(); - while(gid != first_wg_in_row && lid == 0 - && ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value)) - ; - - // After you've passed the barrier, update your local flag to make sure that - // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0) - wg_flags[gid] ^= 1U; - // All but the final workgroup in a long-row collaboration have the same start_row - // and stop_row. They only run for one iteration. // Load in a bunch of partial results into your register space, rather than LDS (no // contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. @@ -469,8 +469,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_adaptive_device(bool conj, // Reduce partial sums rocsparse_blockreduce_sum(lid, partialSums); + // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. 
+ // If your flag == first_wg's flag, you spin loop. + // The first workgroup will eventually flip this flag, and you can move forward. if(lid == 0) { + if(gid != first_wg_in_row) + { + while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value) + ; + + // __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + wg_flags[gid] ^= 1U; + } + rocsparse_atomic_add(y + row, partialSums[0]); } } @@ -946,24 +961,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj, // The first workgroup handles the output initialization. Y out_val = y[row]; temp_sum = (beta - static_cast(1)) * out_val; - atomicXor(&wg_flags[first_wg_in_row], 1U); // Release other workgroups. + + // All inter thread communication is done using atomics, therefore cache flushes or + // invalidates should not be needed (thus __threadfence() has been removed to regain + // performance). + // Because of atomics being relaxed, however, the compiler is allowed to reorder them + // with respect to ordinary memory accesses (and other relaxed atomic operations). + // In this case, out_val seems to be reordered with the xor and subsequently, accumulation + // ends up being wrong. + // To force the compiler to stick to the order of operations, we need acquire/release fences. + // Workgroup scope is sufficient for this purpose, to only invalidate L1 and avoid L2 + // invalidations. + __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup"); + + // Release other workgroups + atomicXor(&wg_flags[first_wg_in_row], 1U); } - // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. - // If your flag == first_wg's flag, you spin loop. - // The first workgroup will eventually flip this flag, and you can move forward. 
- __threadfence(); - while(gid != first_wg_in_row && lid == 0 - && ((rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U)) == compare_value)) - ; - - // After you've passed the barrier, update your local flag to make sure that - // the next time through, you know what to wait on. - if(gid != first_wg_in_row && lid == 0) - wg_flags[gid] ^= 1U; - - // All but the final workgroup in a long-row collaboration have the same start_row - // and stop_row. They only run for one iteration. // Load in a bunch of partial results into your register space, rather than LDS (no // contention) // Then dump the partially reduced answers into the LDS for inter-work-item reduction. @@ -980,8 +994,23 @@ ROCSPARSE_DEVICE_ILF void csrmvn_lrb_long_rows_device(bool conj, // Reduce partial sums rocsparse_blockreduce_sum(lid, partialSums); + // For every other workgroup, wg_flags[first_wg_in_row] holds the value they wait on. + // If your flag == first_wg's flag, you spin loop. + // The first workgroup will eventually flip this flag, and you can move forward. if(lid == 0) { + if(gid != first_wg_in_row) + { + while(rocsparse_atomic_max(&wg_flags[first_wg_in_row], 0U) == compare_value) + ; + + // __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup"); + + // After you've passed the barrier, update your local flag to make sure that + // the next time through, you know what to wait on. + wg_flags[gid] ^= 1U; + } + rocsparse_atomic_add((y + row), partialSums[0]); } }