diff --git a/.clang-format b/.clang-format
index 0395753c8..605a54fdc 100644
--- a/.clang-format
+++ b/.clang-format
@@ -58,8 +58,6 @@ BraceWrapping:
     AfterNamespace: true
     AfterStruct: true
     AfterUnion: true
-    BeforeCatch: true
-    BeforeElse: true
     AfterExternBlock: false
     BeforeCatch: true
     BeforeElse: true
diff --git a/.gitignore b/.gitignore
index 3bee42a2a..42ae29137 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,14 @@
 ### Build dirs ###
-build/
+build*/
+
+### clangd ###
+/.cache
+
+### Docs dirs ###
+doc/html/
+doc/xml/
+doc/latex/
+doc/*.tag
 
 # Created by https://www.gitignore.io/api/c++,cmake
 
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 979c4af77..0229df12c 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -230,9 +230,7 @@ build:package:
 
 build:benchmark:
   stage: build
-  needs:
-    - job: "autotune:generate-config"
-      optional: true
+  needs: []
   tags:
     - rocm-build
   extends:
@@ -270,7 +268,7 @@ autotune:build:
   extends:
     - .cmake-minimum
     - .gpus:rocm-gpus
-    - .rules:manual
+    - .rules:benchmark
   variables:
     BENCHMARK_TARGETS: benchmark_config_tuning
   script:
@@ -282,6 +280,7 @@ autotune:build:
       -S $CI_PROJECT_DIR
       -G Ninja
       -D CMAKE_CXX_COMPILER="$AMDCLANG"
+      -D CMAKE_CXX_FLAGS="-Wno-#pragma-messages"
       -D CMAKE_BUILD_TYPE=Release
       -D BUILD_TEST=OFF
       -D BUILD_EXAMPLE=OFF
@@ -472,11 +471,7 @@ autotune:execute-tuning:
     # Exclude benchmark that is known to fail on gfx906
     # On ROCm 5.7 or later, check if this can be removed - the presumption is that the failure is caused by a compiler issue.
     - >
-      if [[ "${GPU_TARGET}" == "gfx906" ]] && [[ "${AUTOTUNE_ALGORITHM_REGEX}" == "" ]]; then
-        export AUTOTUNE_ALGORITHM_REGEX="-\{\"lvl\":\"device\",\"algo\":\"radix_sort_onesweep\",\"key_type\":\"short\",\"value_type\":\"short\",\"cfg\":\{\"histogram\":\{\"bs\":1024,\"ipt\":22},\"sort\":\{\"bs\":1024,\"ipt\":22},\"bits_per_place\":5,\"algorithm\":\"block_radix_rank_algorithm::match\"}}"
-      fi
-    - 'printf "CI Variables used in benchmarks:\nAUTOTUNE_RESULT_DIR: %s\nAUTOTUNE_FILENAME_REGEX: %s\nAUTOTUNE_ALGORITHM_REGEX: %s \nAUTOTUNE_SIZE: %s \nAUTOTUNE_TRIALS: %s\n" "$AUTOTUNE_RESULT_DIR" "$AUTOTUNE_FILENAME_REGEX" "$AUTOTUNE_ALGORITHM_REGEX" "$AUTOTUNE_SIZE" "$AUTOTUNE_TRIALS"'
-    - cd "${CI_PROJECT_DIR}"
+      cd "${CI_PROJECT_DIR}"
     - mkdir -p "${AUTOTUNE_RESULT_DIR}"
     - python3
       .gitlab/run_benchmarks.py
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab972ba62..693593bc7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,19 +2,39 @@
 
 Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/en/latest/](https://rocprim.readthedocs.io/en/latest/)
 
+## [Unreleased rocPRIM-3.0.0 for ROCm 6.1.0]
+### Added
+- Added new primitive: `block_run_length_decode`.
+### Changed
+- Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`.
+- Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config` and `reduce_by_key_config_v2` to `reduce_by_key_config`.
+- Removed support for custom config types for device algorithms.
+- `host_warp_size()` was moved into `rocprim/device/config_types.hpp`, and now uses either a `device_id` or a `stream` parameter to query the proper device and a `warp_size` out parameter. The return type is `hipError_t`.
+- Added support for `__int128_t` in `device_radix_sort` and `block_radix_sort`.
+### Fixed
+- Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage.
+
 ## [Unreleased rocPRIM-2.13.1 for ROCm 5.7.0]
 ### Added
 - `block_sort::sort()` overload for keys and values with a dynamic size, for all block sort algorithms. Additionally, all `block_sort::sort()` overloads with a dynamic size are now supported for `block_sort_algorithm::merge_sort` and `block_sort_algorithm::bitonic_sort`.
 - New two-way partition primitive `partition_two_way` which can write to two separate iterators.
+- Added config tuning and dynamic dispatch to the `device_adjacent_difference` algorithm.
+- New `rocprim::group_elect` warp intrinsic, which chooses one lane from the lanes enabled by a mask.
 ### Changed
 - Deprecated configuration `radix_sort_config` for device-level radix sort as it no longer matches the algorithm's parameters. New configuration `radix_sort_config_v2` is preferred instead.
 - Removed erroneous implementation of device-level `inclusive_scan` and `exclusive_scan`. The prior default implementation using lookback-scan now is the only available implementation.
 - The benchmark metric indicating the bytes processed for `exclusive_scan_by_key` and `inclusive_scan_by_key` has been changed to incorporate the key type. Furthermore, the benchmark log has been changed such that these algorithms are reported as `scan` and `scan_by_key` instead of `scan_exclusive` and `scan_inclusive`.
 - Deprecated configurations `scan_config` and `scan_by_key_config` for device-level scans, as they no longer match the algorithm's parameters. New configurations `scan_config_v2` and `scan_by_key_config_v2` are preferred instead.
 - Improved the performance of `partition`.
+- `merge_sort_block_sort` will always use stable merge sort as it is faster than the fallback implementation.
+- The `rocprim::match_any` interface has a new parameter, `valid`, to enable/disable lanes. The default value is `true`, so it doesn't change the previous behaviour.
 ### Fixed
 - Fixed build issue caused by missing header in `thread/thread_search.hpp`.
 - Fixed `rocprim::MatchAny` for devices with 64-bit warp size. The function `rocprim::MatchAny` is deprecated and `rocprim::match_any` is preferred instead.
+- Fixed `device_adjacent_difference` using more shared memory than required.
+- Fixed a compilation error when `ROCPRIM_DISABLE_DPP` is defined.
+- Made the detection of GPU architecture features more robust. Developers no longer need to list each architecture explicitly, which fixes compilation failures when
+  targeting devices not known by rocPRIM.
 
 ## [rocPRIM-2.13.0 for ROCm 5.5.0]
 ### Added
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index c65de966c..8087b43ff 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -123,6 +123,7 @@ add_rocprim_benchmark(benchmark_block_histogram.cpp)
 add_rocprim_benchmark(benchmark_block_radix_sort.cpp)
 add_rocprim_benchmark(benchmark_block_radix_rank.cpp)
 add_rocprim_benchmark(benchmark_block_reduce.cpp)
+add_rocprim_benchmark(benchmark_block_run_length_decode.cpp)
 add_rocprim_benchmark(benchmark_block_scan.cpp)
 add_rocprim_benchmark(benchmark_block_sort.cpp)
 add_rocprim_benchmark(benchmark_config_dispatch.cpp)
diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake
index d1fcd2490..510c222ad 100644
--- a/benchmark/ConfigAutotuneSettings.cmake
+++ b/benchmark/ConfigAutotuneSettings.cmake
@@ -29,10 +29,10 @@ set(LIMITED_TUNING_TYPES "int64_t int short int8_t")
 
 function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix)
   if(file STREQUAL "benchmark_device_adjacent_difference")
-    set(list_across_names "DataType;Left;InPlace;BlockSize;ItemsPerThread" PARENT_SCOPE)
+    set(list_across_names "DataType;Left;InPlace;BlockSize" PARENT_SCOPE)
     set(list_across "${TUNING_TYPES};\
-true false;true false;64 128;1 2 4 8 16" PARENT_SCOPE)
-    set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
+true;false true;32 64 128 256 512 1024" PARENT_SCOPE)
+    set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@" PARENT_SCOPE)
   elseif(file STREQUAL "benchmark_device_histogram")
     set(list_across_names "DataType;BlockSize" PARENT_SCOPE)
     set(list_across "${TUNING_TYPES};64 128 256" PARENT_SCOPE)
diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp
new file mode 100644
index 000000000..04e1f0428
--- /dev/null
+++ b/benchmark/benchmark_block_run_length_decode.cpp
@@ -0,0 +1,242 @@
+// MIT License
+//
+// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "benchmark/benchmark.h"
+#include "benchmark_utils.hpp"
+#include "cmdparser.hpp"
+
+#include "rocprim/block/block_load.hpp"
+#include "rocprim/block/block_run_length_decode.hpp"
+#include "rocprim/block/block_store.hpp"
+
+#include <random>
+#include <vector>
+
+#ifndef DEFAULT_N
+const size_t DEFAULT_N = 1024 * 1024 * 32;
+#endif
+
+template<class ItemT,
+         class OffsetT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials>
+__global__
+    __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT*   d_run_items,
+                                                                     const OffsetT* d_run_offsets,
+                                                                     ItemT*         d_decoded_items,
+                                                                     bool enable_store = false)
+{
+    using BlockRunLengthDecodeT
+        = rocprim::block_run_length_decode<ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread>;
+    // Benchmark kernel: each block decodes its BlockSize * RunsPerThread runs, Trials times over.
+    ItemT   run_items[RunsPerThread];
+    OffsetT run_offsets[RunsPerThread];
+    // Each thread loads RunsPerThread run values and run offsets with the direct blocked loader.
+    const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x;
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items);
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets);
+    // NOTE(review): temp_storage is declared but never passed to the decoder below - confirm intent.
+    ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage;
+    BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets);
+    // Decoded length of this block: offset at the next block's first run minus offset at its own.
+    const OffsetT total_decoded_size
+        = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread]
+          - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread];
+    // The whole windowed decode is repeated Trials times inside a single launch.
+#pragma nounroll
+    for(unsigned i = 0; i < Trials; ++i)
+    {
+        OffsetT decoded_window_offset = 0;
+        while(decoded_window_offset < total_decoded_size)
+        {
+            ItemT decoded_items[DecodedItemsPerThread];
+            block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset);
+            // Optional store (off by default). NOTE(review): addressed by global thread id plus window offset only - confirm before enabling.
+            if(enable_store)
+            {
+                rocprim::block_store_direct_blocked(global_thread_idx,
+                                                    d_decoded_items + decoded_window_offset,
+                                                    decoded_items);
+            }
+            // Move on to the next window of BlockSize * DecodedItemsPerThread decoded items.
+            decoded_window_offset += BlockSize * DecodedItemsPerThread;
+        }
+    }
+}
+
+template<class ItemT,
+         class OffsetT,
+         unsigned MinRunLength,
+         unsigned MaxRunLength,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials = 100>
+void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
+{
+    constexpr auto runs_per_block  = BlockSize * RunsPerThread;
+    const auto     target_num_runs = 2 * N / (MinRunLength + MaxRunLength);
+    const auto     num_runs
+        = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block);
+    // num_runs is target_num_runs rounded up to a multiple of runs_per_block (whole blocks only).
+    std::vector<ItemT>   run_items(num_runs);
+    std::vector<OffsetT> run_offsets(num_runs + 1);
+    // Random inputs: run values drawn from the range 0..100, run lengths from MinRunLength..MaxRunLength.
+    std::default_random_engine prng(std::random_device{}());
+    using ItemDistribution = std::conditional_t<std::is_integral<ItemT>::value,
+                                                std::uniform_int_distribution<ItemT>,
+                                                std::uniform_real_distribution<ItemT>>;
+    ItemDistribution                       run_item_dist(0, 100);
+    std::uniform_int_distribution<OffsetT> run_length_dist(MinRunLength, MaxRunLength);
+    // run_offsets[0] keeps its initial value of 0; every later entry adds one random run length.
+    for(size_t i = 0; i < num_runs; ++i)
+    {
+        run_items[i] = run_item_dist(prng);
+    }
+    for(size_t i = 1; i < num_runs + 1; ++i)
+    {
+        const OffsetT next_run_length = run_length_dist(prng);
+        run_offsets[i]                = run_offsets[i - 1] + next_run_length;
+    }
+    const OffsetT output_length = run_offsets.back();
+    // Upload the run values and the run offsets to the device.
+    ItemT* d_run_items{};
+    HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT)));
+    HIP_CHECK(hipMemcpy(d_run_items,
+                        run_items.data(),
+                        run_items.size() * sizeof(ItemT),
+                        hipMemcpyHostToDevice));
+
+    OffsetT* d_run_offsets{};
+    HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT)));
+    HIP_CHECK(hipMemcpy(d_run_offsets,
+                        run_offsets.data(),
+                        run_offsets.size() * sizeof(OffsetT),
+                        hipMemcpyHostToDevice));
+    // Output buffer sized for the full decoded sequence; the launch below passes no enable_store argument.
+    ItemT* d_output{};
+    HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT)));
+    // Manual timing: host clock around kernel launch plus device synchronize, reported per iteration.
+    for(auto _ : state)
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel<ItemT,
+                                                                          OffsetT,
+                                                                          BlockSize,
+                                                                          RunsPerThread,
+                                                                          DecodedItemsPerThread,
+                                                                          Trials>),
+                           dim3(num_runs / runs_per_block),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_run_items,
+                           d_run_offsets,
+                           d_output);
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+
+        state.SetIterationTime(elapsed_seconds.count());
+    }
+    state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials);
+    state.SetItemsProcessed(state.iterations() * output_length * Trials);
+
+    HIP_CHECK(hipFree(d_run_items));
+    HIP_CHECK(hipFree(d_run_offsets));
+    HIP_CHECK(hipFree(d_output));
+}
+
+#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT)                                 \
+    benchmark::RegisterBenchmark("block_run_length_decode<Item Type:" #IT ",Offset Type:" #OT \
+                                 ",Min RunLength:" #MINRL ",Max RunLength:" #MAXRL            \
+                                 ",BlockSize: " #BS ",Runs Per Thread:" #RPT                  \
+                                 ",Decoded Items Per Thread:" #DIPT ">",                      \
+                                 &run_benchmark<IT, OT, MINRL, MAXRL, BS, RPT, DIPT>,         \
+                                 stream,                                                      \
+                                 size)
+
+int main(int argc, char* argv[])
+{
+    cli::Parser parser(argc, argv);
+    parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
+    parser.set_optional<int>("trials", "trials", -1, "number of iterations");
+    parser.run_and_exit_if_error();
+    // "size" defaults to DEFAULT_N; "trials" defaults to -1, and only a positive value forces the iteration count.
+    // Parse argv
+    benchmark::Initialize(&argc, argv);
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
+
+    std::cout << "benchmark_block_run_length_decode" << std::endl;
+
+    // HIP
+    hipStream_t     stream = 0; // default
+    hipDeviceProp_t devProp;
+    int             device_id = 0;
+    HIP_CHECK(hipGetDevice(&device_id));
+    HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+    std::cout << "[HIP] Device name: " << devProp.name << std::endl;
+
+    // Add benchmarks
+    std::vector<benchmark::internal::Benchmark*> benchmarks{
+        CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4),
+        // Same run-length ranges, with double items and long long offsets.
+        CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)};
+
+    // Use manual timing
+    for(auto& b : benchmarks)
+    {
+        b->UseManualTime();
+        b->Unit(benchmark::kMillisecond);
+    }
+
+    // Force number of iterations
+    if(trials > 0)
+    {
+        for(auto& b : benchmarks)
+        {
+            b->Iterations(trials);
+        }
+    }
+
+    // Run benchmarks
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+}
diff --git a/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in b/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in
index 0892d2660..03c316d5b 100644
--- a/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in
+++ b/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -26,9 +26,11 @@
 #include "benchmark_device_adjacent_difference.parallel.hpp"
 
 namespace {
-    auto benchmarks = config_autotune_register::create<device_adjacent_difference_benchmark<
+    auto benchmarks = config_autotune_register::create_bulk(
+        device_adjacent_difference_benchmark_generator< 
         @DataType@, 
+        @BlockSize@,
         @Left@, 
-        @InPlace@,
-        rocprim::adjacent_difference_config<@BlockSize@, @ItemsPerThread@>>>();
+        @InPlace@>::create);
+
 }
diff --git a/benchmark/benchmark_device_adjacent_difference.parallel.hpp b/benchmark/benchmark_device_adjacent_difference.parallel.hpp
index b74de4050..51f8cdfda 100644
--- a/benchmark/benchmark_device_adjacent_difference.parallel.hpp
+++ b/benchmark/benchmark_device_adjacent_difference.parallel.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -39,21 +39,36 @@
 
 #include "benchmark_utils.hpp"
 
-template<typename T    = int,
-         bool left     = false,
-         bool in_place = false,
-         typename Config
-         = rocprim::detail::default_adjacent_difference_config<ROCPRIM_TARGET_ARCH, T>>
+template<typename Config>
+std::string config_name()
+{
+    // Formats the config's block_size and items_per_thread as "{bs:<value>,ipt:<value>}".
+    auto config = Config();
+    return "{bs:" + std::to_string(config.block_size)
+           + ",ipt:" + std::to_string(config.items_per_thread) + "}";
+}
+// Specialization for rocprim::default_config: returns the fixed name "default_config".
+template<>
+inline std::string config_name<rocprim::default_config>()
+{
+    return "default_config";
+}
+
+template<typename T      = int,
+         bool Left       = false,
+         bool InPlace    = false,
+         typename Config = rocprim::default_config>
 struct device_adjacent_difference_benchmark : public config_autotune_interface
 {
+
     std::string name() const override
     {
+
         using namespace std::string_literals;
-        return bench_naming::format_name(
-            "{lvl:device,algo:adjacent_difference" + (left ? ""s : "_right"s)
-            + (in_place ? "_inplace"s : ""s) + ",key_type:" + std::string(Traits<T>::name())
-            + ",cfg:{bs:" + std::to_string(Config::block_size)
-            + ",ipt:" + std::to_string(Config::items_per_thread) + "}}");
+        return bench_naming::format_name("{lvl:device,algo:adjacent_difference"
+                                         + (Left ? ""s : "_right"s) + (InPlace ? "_inplace"s : ""s)
+                                         + ",value_type:" + std::string(Traits<T>::name())
+                                         + ",cfg:" + config_name<Config>() + "}");
     }
 
     static constexpr unsigned int batch_size  = 10;
@@ -84,11 +99,11 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface
                                       const OutputIt output,
                                       Args&&... args) const
     {
-        return ::rocprim::adjacent_difference_right(temporary_storage,
-                                                    storage_size,
-                                                    input,
-                                                    output,
-                                                    std::forward<Args>(args)...);
+        return ::rocprim::adjacent_difference_right<Config>(temporary_storage,
+                                                            storage_size,
+                                                            input,
+                                                            output,
+                                                            std::forward<Args>(args)...);
     }
 
     template<typename InputIt, typename OutputIt, typename... Args>
@@ -140,13 +155,13 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface
                             input.size() * sizeof(input[0]),
                             hipMemcpyHostToDevice));
 
-        if(!in_place)
+        if(!InPlace)
         {
             HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type)));
         }
 
-        static constexpr auto left_tag     = rocprim::detail::bool_constant<left>{};
-        static constexpr auto in_place_tag = rocprim::detail::bool_constant<in_place>{};
+        static constexpr auto left_tag     = rocprim::detail::bool_constant<Left>{};
+        static constexpr auto in_place_tag = rocprim::detail::bool_constant<InPlace>{};
 
         // Allocate temporary storage
         std::size_t temp_storage_size;
@@ -208,7 +223,7 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface
         state.SetItemsProcessed(state.iterations() * batch_size * size);
 
         hipFree(d_input);
-        if(!in_place)
+        if(!InPlace)
         {
             hipFree(d_output);
         }
@@ -216,4 +231,34 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface
     }
 };
 
+template<typename T, unsigned int BlockSize, bool Left, bool InPlace>
+struct device_adjacent_difference_benchmark_generator
+{
+    // Appends one device_adjacent_difference_benchmark per generated config for this BlockSize.
+    template<unsigned int ItemsPerThread>
+    struct create_ipt
+    {
+        using generated_config
+            = rocprim::adjacent_difference_config<BlockSize, 1 << ItemsPerThread>;
+        // ItemsPerThread is used as an exponent: the config above gets 1 << ItemsPerThread items.
+        void operator()(std::vector<std::unique_ptr<config_autotune_interface>>& storage)
+        {
+            storage.emplace_back(
+                std::make_unique<
+                    device_adjacent_difference_benchmark<T, Left, InPlace, generated_config>>());
+        }
+    };
+    // Applies create_ipt over the index range built from the exponent bounds computed below.
+    static void create(std::vector<std::unique_ptr<config_autotune_interface>>& storage)
+    {
+        static constexpr unsigned int min_items_per_thread = 1;
+        static constexpr unsigned int max_items_per_thread_arg
+            = TUNING_SHARED_MEMORY_MAX / (BlockSize * sizeof(T) * 2 + sizeof(T));
+        static constexpr unsigned int max_items_per_thread
+            = rocprim::Log2<max_items_per_thread_arg>::VALUE - 1;
+        static_for_each<make_index_range<unsigned int, min_items_per_thread, max_items_per_thread>,
+                        create_ipt>(storage);
+    }
+};
+
 #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_
diff --git a/benchmark/benchmark_device_binary_search.cpp b/benchmark/benchmark_device_binary_search.cpp
index 242619b8c..77353b6a1 100644
--- a/benchmark/benchmark_device_binary_search.cpp
+++ b/benchmark/benchmark_device_binary_search.cpp
@@ -40,6 +40,7 @@
 #include <rocprim/rocprim.hpp>
 
 #include "benchmark_device_binary_search.parallel.hpp"
+#include "rocprim/device/config_types.hpp"
 
 #ifndef DEFAULT_N
 const size_t DEFAULT_N = 1024 * 1024 * 32;
@@ -96,32 +97,33 @@ void run_benchmark(benchmark::State& state,
 
     void * d_temporary_storage = nullptr;
     size_t temporary_storage_bytes;
-    HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{},
-                                     d_temporary_storage,
-                                     temporary_storage_bytes,
-                                     d_haystack,
-                                     d_needles,
-                                     d_output,
-                                     haystack_size,
-                                     needles_size,
-                                     compare_op,
-                                     stream));
+    auto   dispatch_helper = dispatch_binary_search_helper<rocprim::default_config>();
+    HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{},
+                                                     d_temporary_storage,
+                                                     temporary_storage_bytes,
+                                                     d_haystack,
+                                                     d_needles,
+                                                     d_output,
+                                                     haystack_size,
+                                                     needles_size,
+                                                     compare_op,
+                                                     stream));
 
     HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
 
     // Warm-up
     for(size_t i = 0; i < warmup_size; i++)
     {
-        HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{},
-                                         d_temporary_storage,
-                                         temporary_storage_bytes,
-                                         d_haystack,
-                                         d_needles,
-                                         d_output,
-                                         haystack_size,
-                                         needles_size,
-                                         compare_op,
-                                         stream));
+        HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{},
+                                                         d_temporary_storage,
+                                                         temporary_storage_bytes,
+                                                         d_haystack,
+                                                         d_needles,
+                                                         d_output,
+                                                         haystack_size,
+                                                         needles_size,
+                                                         compare_op,
+                                                         stream));
     }
     HIP_CHECK(hipDeviceSynchronize());
 
@@ -137,16 +139,16 @@ void run_benchmark(benchmark::State& state,
 
         for(size_t i = 0; i < batch_size; i++)
         {
-            HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{},
-                                             d_temporary_storage,
-                                             temporary_storage_bytes,
-                                             d_haystack,
-                                             d_needles,
-                                             d_output,
-                                             haystack_size,
-                                             needles_size,
-                                             compare_op,
-                                             stream));
+            HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{},
+                                                             d_temporary_storage,
+                                                             temporary_storage_bytes,
+                                                             d_haystack,
+                                                             d_needles,
+                                                             d_output,
+                                                             haystack_size,
+                                                             needles_size,
+                                                             compare_op,
+                                                             stream));
         }
 
         // Record stop event and wait until it completes
diff --git a/benchmark/benchmark_device_binary_search.parallel.hpp b/benchmark/benchmark_device_binary_search.parallel.hpp
index fdc3d5d27..cb9ff3c79 100644
--- a/benchmark/benchmark_device_binary_search.parallel.hpp
+++ b/benchmark/benchmark_device_binary_search.parallel.hpp
@@ -28,6 +28,8 @@
 #include <vector>
 
 #include "benchmark_utils.hpp"
+#include "rocprim/device/config_types.hpp"
+#include "rocprim/device/detail/device_config_helper.hpp"
 #include <benchmark/benchmark.h>
 #include <hip/hip_runtime_api.h>
 #include <rocprim/device/device_binary_search.hpp>
@@ -56,23 +58,52 @@ struct upper_bound_subalgorithm
     }
 };
 
-template<class Config = rocprim::default_config, class... Args>
-hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args)
+template<class Config = rocprim::default_config>
+struct dispatch_binary_search_helper
 {
-    return rocprim::binary_search<Config>(std::forward<Args>(args)...);
-}
+    template<class... Args>
+    hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args)
+    {
+        using config = rocprim::binary_search_config<Config::block_size, Config::items_per_thread>;
+        return rocprim::binary_search<config>(std::forward<Args>(args)...);
+    }
 
-template<class Config = rocprim::default_config, class... Args>
-hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args)
-{
-    return rocprim::upper_bound<Config>(std::forward<Args>(args)...);
-}
+    template<class... Args>
+    hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args)
+    {
+        using config = rocprim::upper_bound_config<Config::block_size, Config::items_per_thread>;
+        return rocprim::upper_bound<config>(std::forward<Args>(args)...);
+    }
+
+    template<class... Args>
+    hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args)
+    {
+        using config = rocprim::lower_bound_config<Config::block_size, Config::items_per_thread>;
+        return rocprim::lower_bound<config>(std::forward<Args>(args)...);
+    }
+};
 
-template<class Config = rocprim::default_config, class... Args>
-hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args)
+template<>
+struct dispatch_binary_search_helper<rocprim::default_config>
 {
-    return rocprim::lower_bound<Config>(std::forward<Args>(args)...);
-}
+    template<class... Args>
+    hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args)
+    {
+        return rocprim::binary_search<rocprim::default_config>(std::forward<Args>(args)...);
+    }
+
+    template<class... Args>
+    hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args)
+    {
+        return rocprim::upper_bound<rocprim::default_config>(std::forward<Args>(args)...);
+    }
+
+    template<class... Args>
+    hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args)
+    {
+        return rocprim::lower_bound<rocprim::default_config>(std::forward<Args>(args)...);
+    }
+};
 
 template<class SubAlgorithm, class T, class OutputType, class Config>
 struct device_binary_search_benchmark : public config_autotune_interface
@@ -116,30 +147,31 @@ struct device_binary_search_benchmark : public config_autotune_interface
 
         void*  d_temporary_storage = nullptr;
         size_t temporary_storage_bytes;
-        HIP_CHECK(dispatch_binary_search<Config>(SubAlgorithm{},
-                                                 d_temporary_storage,
-                                                 temporary_storage_bytes,
-                                                 d_haystack,
-                                                 d_needles,
-                                                 d_output,
-                                                 haystack_size,
-                                                 needles_size,
-                                                 compare_op,
-                                                 stream));
+        auto   dispatch_helper = dispatch_binary_search_helper<Config>();
+        HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{},
+                                                         d_temporary_storage,
+                                                         temporary_storage_bytes,
+                                                         d_haystack,
+                                                         d_needles,
+                                                         d_output,
+                                                         haystack_size,
+                                                         needles_size,
+                                                         compare_op,
+                                                         stream));
 
         HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
 
         // Warm-up
-        HIP_CHECK(dispatch_binary_search<Config>(SubAlgorithm{},
-                                                 d_temporary_storage,
-                                                 temporary_storage_bytes,
-                                                 d_haystack,
-                                                 d_needles,
-                                                 d_output,
-                                                 haystack_size,
-                                                 needles_size,
-                                                 compare_op,
-                                                 stream));
+        HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{},
+                                                         d_temporary_storage,
+                                                         temporary_storage_bytes,
+                                                         d_haystack,
+                                                         d_needles,
+                                                         d_output,
+                                                         haystack_size,
+                                                         needles_size,
+                                                         compare_op,
+                                                         stream));
         HIP_CHECK(hipDeviceSynchronize());
 
         // HIP events creation
@@ -152,16 +184,16 @@ struct device_binary_search_benchmark : public config_autotune_interface
             // Record start event
             HIP_CHECK(hipEventRecord(start, stream));
 
-            HIP_CHECK(dispatch_binary_search<Config>(SubAlgorithm{},
-                                                     d_temporary_storage,
-                                                     temporary_storage_bytes,
-                                                     d_haystack,
-                                                     d_needles,
-                                                     d_output,
-                                                     haystack_size,
-                                                     needles_size,
-                                                     compare_op,
-                                                     stream));
+            HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{},
+                                                             d_temporary_storage,
+                                                             temporary_storage_bytes,
+                                                             d_haystack,
+                                                             d_needles,
+                                                             d_output,
+                                                             haystack_size,
+                                                             needles_size,
+                                                             compare_op,
+                                                             stream));
 
             // Record stop event and wait until it completes
             HIP_CHECK(hipEventRecord(stop, stream));
diff --git a/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp b/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp
index e744eb060..57ca12ae3 100644
--- a/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp
+++ b/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp
@@ -59,8 +59,7 @@ std::string config_name()
 {
     const rocprim::detail::merge_sort_block_sort_config_params config = Config();
     return "{bs:" + std::to_string(config.block_sort_config.block_size)
-           + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread)
-           + ",method:" + std::string(get_block_sort_method_name(config.block_sort_method)) + "}";
+           + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread) + "}";
 }
 
 template<>
diff --git a/benchmark/benchmark_device_scan.parallel.hpp b/benchmark/benchmark_device_scan.parallel.hpp
index 9eb3b59a1..4f976d4c4 100644
--- a/benchmark/benchmark_device_scan.parallel.hpp
+++ b/benchmark/benchmark_device_scan.parallel.hpp
@@ -222,12 +222,12 @@ struct device_scan_benchmark_generator
             {
                 void operator()(std::vector<std::unique_ptr<config_autotune_interface>>& storage)
                 {
-                    storage.emplace_back(std::make_unique<device_scan_benchmark<
-                                             false,
-                                             T,
-                                             rocprim::plus<T>,
-                                             rocprim::scan_config_v2<
-                                                 block_size,
+                    storage.emplace_back(
+                        std::make_unique<device_scan_benchmark<
+                            false,
+                            T,
+                            rocprim::plus<T>,
+                            rocprim::scan_config<block_size,
                                                  ItemsPerThread,
                                                  rocprim::block_load_method::block_load_transpose,
                                                  rocprim::block_store_method::block_store_transpose,
diff --git a/benchmark/benchmark_device_scan_by_key.parallel.hpp b/benchmark/benchmark_device_scan_by_key.parallel.hpp
index cd878273f..e4748901a 100644
--- a/benchmark/benchmark_device_scan_by_key.parallel.hpp
+++ b/benchmark/benchmark_device_scan_by_key.parallel.hpp
@@ -258,7 +258,7 @@ struct device_scan_by_key_benchmark_generator
                                              rocprim::plus<ValueType>,
                                              rocprim::equal_to<KeyType>,
                                              1024,
-                                             rocprim::scan_by_key_config_v2<
+                                             rocprim::scan_by_key_config<
                                                  block_size,
                                                  ItemsPerThread,
                                                  rocprim::block_load_method::block_load_transpose,
diff --git a/benchmark/benchmark_utils.hpp b/benchmark/benchmark_utils.hpp
index 93931a2f3..2d0c578f4 100644
--- a/benchmark/benchmark_utils.hpp
+++ b/benchmark/benchmark_utils.hpp
@@ -261,9 +261,11 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size =
     return data;
 }
 
-inline bool is_warp_size_supported(const unsigned int required_warp_size)
+inline bool is_warp_size_supported(const unsigned int required_warp_size, const int device_id)
 {
-    return ::rocprim::host_warp_size() >= required_warp_size;
+    unsigned int warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, warp_size));
+    return warp_size >= required_warp_size;
 }
 
 template<unsigned int LogicalWarpSize>
diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp
index 3fd58f5c3..2abbe389d 100644
--- a/benchmark/benchmark_warp_exchange.cpp
+++ b/benchmark/benchmark_warp_exchange.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -350,7 +350,9 @@ int main(int argc, char *argv[])
         CREATE_BENCHMARK(int, 256, 16, 32, ScatterToStripedOp)
     };
 
-    if(is_warp_size_supported(64))
+    int hip_device = 0;
+    HIP_CHECK(::rocprim::detail::get_device_from_stream(stream, hip_device));
+    if(is_warp_size_supported(64, hip_device))
     {
         std::vector<benchmark::internal::Benchmark*> additional_benchmarks{
             CREATE_BENCHMARK(int, 256,  1, 64, BlockedToStripedOp),
diff --git a/docs/device_ops/sort.rst b/docs/device_ops/sort.rst
index dcaf2778d..a0674e59c 100644
--- a/docs/device_ops/sort.rst
+++ b/docs/device_ops/sort.rst
@@ -7,7 +7,7 @@ Configuring the kernel
 merge_sort
 ..........
 
-.. doxygentypedef:: rocprim::merge_sort_config
+.. doxygenstruct:: rocprim::merge_sort_config
 
 radix_sort
 ..........
diff --git a/docs/device_ops/transform.rst b/docs/device_ops/transform.rst
index a401140bf..b38eb45bc 100644
--- a/docs/device_ops/transform.rst
+++ b/docs/device_ops/transform.rst
@@ -4,7 +4,7 @@ Transform
 Configuring the kernel
 ~~~~~~~~~~~~~~~~~~~~~~
 
-.. doxygentypedef:: rocprim::transform_config
+.. doxygenstruct:: rocprim::transform_config
 
 transform
 ~~~~~~~~~
diff --git a/docs/intrinsics.rst b/docs/intrinsics.rst
index dc5ae191c..d4b379427 100644
--- a/docs/intrinsics.rst
+++ b/docs/intrinsics.rst
@@ -12,7 +12,8 @@ Warp size
 ---------
 
 .. doxygenfunction:: rocprim::warp_size()
-.. doxygenfunction:: rocprim::host_warp_size()
+.. doxygenfunction:: rocprim::host_warp_size(const int device_id, unsigned int& warp_size)
+.. doxygenfunction:: rocprim::host_warp_size(const hipStream_t stream, unsigned int& warp_size)
 .. doxygenfunction:: rocprim::device_warp_size()
 
 Lane and Warp ID
diff --git a/rmake.py b/rmake.py
index e3d0816cb..265577f1d 100644
--- a/rmake.py
+++ b/rmake.py
@@ -101,6 +101,12 @@ def config_cmd():
         #set CPACK_PACKAGING_INSTALL_PREFIX= defined as blank as it is appended to end of path for archive creation
         cmake_platform_opts.append( f"-DWIN32=ON -DCPACK_PACKAGING_INSTALL_PREFIX=") #" -DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path}"
         cmake_platform_opts.append( f"-DCMAKE_INSTALL_PREFIX=\"C:/hipSDK\"" )
+
+        # MSVC requires acknowledgement of using extended aligned storage.
+        # Before VS 2017 15.8, such storage had non-conforming alignment. VS 2017 15.8 fixes this, but inherently changes layouts of
+        # aligned storage with extended alignment, and thus binary compatibility with such types.
+        cmake_platform_opts.append( "-DCMAKE_CXX_FLAGS=\"-D_ENABLE_EXTENDED_ALIGNED_STORAGE\"")
+
         rocm_cmake_path = '"' + cmake_path(os.getenv("ROCM_CMAKE_PATH", "C:/hipSDK")) + '"'
         generator = f"-G Ninja"
         # "-G \"Visual Studio 16 2019\" -A x64"  #  -G NMake ")  #
diff --git a/rocprim/include/rocprim/block/block_radix_sort.hpp b/rocprim/include/rocprim/block/block_radix_sort.hpp
index f94deb0e0..71c1f37f0 100644
--- a/rocprim/include/rocprim/block/block_radix_sort.hpp
+++ b/rocprim/include/rocprim/block/block_radix_sort.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -884,6 +884,18 @@ class block_radix_sort
     }
 };
 
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+template<class Key,
+         unsigned int BlockSizeX,
+         unsigned int ItemsPerThread,
+         class Value,
+         unsigned int BlockSizeY,
+         unsigned int BlockSizeZ>
+constexpr unsigned int
+    block_radix_sort<Key, BlockSizeX, ItemsPerThread, Value, BlockSizeY, BlockSizeZ>::
+        radix_bits_per_pass;
+#endif
+
 END_ROCPRIM_NAMESPACE
 
 /// @}
diff --git a/rocprim/include/rocprim/block/block_run_length_decode.hpp b/rocprim/include/rocprim/block/block_run_length_decode.hpp
new file mode 100644
index 000000000..98b3c2515
--- /dev/null
+++ b/rocprim/include/rocprim/block/block_run_length_decode.hpp
@@ -0,0 +1,374 @@
+/******************************************************************************
+ * Copyright (c) 2010-2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#ifndef ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
+#define ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_
+
+#include "../block/block_scan.hpp"
+#include "../config.hpp"
+#include "../detail/temp_storage.hpp"
+#include "../detail/various.hpp"
+#include "../functional.hpp"
+#include "../intrinsics/thread.hpp"
+#include "../thread/thread_search.hpp"
+
+BEGIN_ROCPRIM_NAMESPACE
+
+/**
+ * \brief The block_run_length_decode class supports decoding a run-length encoded array of items. That is, given
+ * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output
+ * array.
+ * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded
+ * array is runtime-dependent and potentially without any upper bound. To address this, block_run_length_decode allows
+ * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS *
+ * DECODED_ITEMS_PER_THREAD (referred to as window_size) decoded items from the specified window will be returned.
+ *
+ * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array).
+ * A run of length zero may not be followed by a run length that is not zero.
+ *
+ * \par
+ * \code
+ * __global__ void ExampleKernel(...)
+ * {
+ *   // Specialising block_run_length_decode to run-length decode items of type uint64_t
+ *   using RunItemT = uint64_t;
+ *   // Type large enough to index into the run-length decoded array
+ *   using RunLengthT = uint32_t;
+ *
+ *   // Specialising block_run_length_decode for a 1D block of 128 threads
+ *   constexpr int BLOCK_DIM_X = 128;
+ *   // Specialising block_run_length_decode to have each thread contribute 2 run-length encoded runs
+ *   constexpr int RUNS_PER_THREAD = 2;
+ *   // Specialising block_run_length_decode to have each thread hold 4 run-length decoded items
+ *   constexpr int DECODED_ITEMS_PER_THREAD = 4;
+ *
+ *   // Specialize block_run_length_decode for a 1D block of 128 threads, with 2 runs and 4 decoded items per thread
+ *   using block_run_length_decodeT =
+ *     rocprim::block_run_length_decode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>;
+ *
+ *   // Allocate shared memory for block_run_length_decode
+ *   __shared__ typename block_run_length_decodeT::storage_type temp_storage;
+ *
+ *   // The run-length encoded items and how often they shall be repeated in the run-length decoded output
+ *   RunItemT run_values[RUNS_PER_THREAD];
+ *   RunLengthT run_lengths[RUNS_PER_THREAD];
+ *   ...
+ *
+ *   // Initialize the block_run_length_decode with the runs that we want to run-length decode
+ *   uint32_t total_decoded_size = 0;
+ *   block_run_length_decodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size);
+ *
+ *   // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs
+ *   // have been decoded.
+ *   uint32_t decoded_window_offset = 0U;
+ *   while (decoded_window_offset < total_decoded_size)
+ *   {
+ *     RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD];
+ *     RunItemT decoded_items[DECODED_ITEMS_PER_THREAD];
+ *
+ *     // The number of decoded items that are valid within this window (aka pass) of run-length decoding
+ *     uint32_t num_valid_items = total_decoded_size - decoded_window_offset;
+ *     block_rld.run_length_decode(decoded_items, relative_offsets, decoded_window_offset);
+ *
+ *     decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD;
+ *
+ *     ...
+ *   }
+ * }
+ * \endcode
+ * \par
+ * Suppose the set of input \p run_values across the block of threads is
+ * <tt>{ [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] }</tt> and
+ * \p run_lengths is <tt>{ [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }</tt>.
+ * The corresponding output \p decoded_items in those threads will be <tt>{ [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4],
+ * [4, 4, 4, 5], ..., [169, 169, 170, 171] }</tt> and \p relative_offsets will be <tt>{ [0, 0, 1, 0], [1, 2, 0, 1], [2,
+ * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] }</tt> during the first iteration of the while loop.
+ *
+ * \tparam ItemT The data type of the items being run-length decoded
+ * \tparam BlockSizeX The thread block length in threads along the X dimension
+ * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes
+ * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds
+ * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the
+ * runs' lengths)
+ * \tparam BlockSizeY The thread block length in threads along the Y dimension
+ * \tparam BlockSizeZ The thread block length in threads along the Z dimension
+ */
+template<typename ItemT,
+         unsigned int BlockSizeX,
+         int          RUNS_PER_THREAD,
+         int          DECODED_ITEMS_PER_THREAD,
+         typename DecodedOffsetT = uint32_t,
+         unsigned int BlockSizeY = 1,
+         unsigned int BlockSizeZ = 1>
+class block_run_length_decode
+{
+private:
+    /// The thread block size in threads
+    static constexpr int BLOCK_THREADS = BlockSizeX * BlockSizeY * BlockSizeZ;
+
+    /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0')
+    static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD;
+
+    /// block_scan used to determine the beginning of each run (i.e., prefix sum over the runs' length)
+    using block_scan_type = rocprim::block_scan<DecodedOffsetT,
+                                                BlockSizeX,
+                                                rocprim::block_scan_algorithm::using_warp_scan,
+                                                BlockSizeY,
+                                                BlockSizeZ>;
+
+    /// Type used to index into the block's runs
+    using RunOffsetT = uint32_t;
+
+    /// Shared memory type required by this thread block
+    union storage_type_
+    {
+        typename block_scan_type::storage_type offset_scan;
+        struct
+        {
+            ItemT          run_values[BLOCK_RUNS];
+            DecodedOffsetT run_offsets[BLOCK_RUNS];
+        } runs;
+    };
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE storage_type_& private_storage()
+    {
+        ROCPRIM_SHARED_MEMORY storage_type private_storage;
+        return private_storage.get();
+    }
+
+    storage_type_& temp_storage;
+
+    uint32_t linear_tid;
+
+public:
+    /// \brief Struct used to allocate a temporary memory that is required for thread
+    /// communication during operations provided by related parallel primitive.
+    ///
+    /// Depending on the implementation the operations exposed by parallel primitive may
+    /// require a temporary storage for thread communication. The storage should be allocated
+    /// using keywords <tt>__shared__</tt>. It can be aliased to
+    /// an externally allocated memory, or be a part of a union type with other storage types
+    /// to increase shared memory reusability.
+    using storage_type = detail::raw_storage<storage_type_>;
+
+    /**
+   * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The
+   * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+   * <b>run_length_decode</b> calls.
+   */
+    template<typename RunLengthT, typename TotalDecodedSizeT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+        block_run_length_decode(storage_type& temp_storage,
+                                ItemT (&run_values)[RUNS_PER_THREAD],
+                                RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                TotalDecodedSizeT& total_decoded_size)
+        : temp_storage(temp_storage.get())
+        , linear_tid(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>())
+    {
+        init_with_run_lengths(run_values, run_lengths, total_decoded_size);
+    }
+
+    /**
+     * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The
+     * algorithm's temporary storage may not be repurposed between the constructor call and subsequent
+     * <b>run_length_decode</b> calls.
+     */
+    template<typename UserRunOffsetT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+        block_run_length_decode(storage_type& temp_storage,
+                                ItemT (&run_values)[RUNS_PER_THREAD],
+                                UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+        : temp_storage(temp_storage.get())
+        , linear_tid(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>())
+    {
+        init_with_run_offsets(run_values, run_offsets);
+    }
+
+    /**
+     * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
+     */
+    template<typename RunLengthT, typename TotalDecodedSizeT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+        block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                                TotalDecodedSizeT& total_decoded_size)
+        : temp_storage(private_storage())
+        , linear_tid(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>())
+    {
+        init_with_run_lengths(run_values, run_lengths, total_decoded_size);
+    }
+
+    /**
+     * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets.
+     */
+    template<typename UserRunOffsetT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE
+        block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD],
+                                UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+        : temp_storage(private_storage())
+        , linear_tid(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>())
+    {
+        init_with_run_offsets(run_values, run_offsets);
+    }
+
+private:
+    template<typename RunOffsetT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        init_with_run_offsets(ItemT (&run_values)[RUNS_PER_THREAD],
+                              RunOffsetT (&run_offsets)[RUNS_PER_THREAD])
+    {
+        // Keep the runs' items and the offsets of each run's beginning in the temporary storage
+        RunOffsetT thread_dst_offset
+            = static_cast<RunOffsetT>(linear_tid) * static_cast<RunOffsetT>(RUNS_PER_THREAD);
+
+#pragma unroll
+        for(int i = 0; i < RUNS_PER_THREAD; ++i, ++thread_dst_offset)
+        {
+            temp_storage.runs.run_values[thread_dst_offset]  = run_values[i];
+            temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i];
+        }
+
+        // Ensure run offsets and run values have been written to shared memory
+        syncthreads();
+    }
+
+    template<typename RunLengthT, typename TotalDecodedSizeT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        init_with_run_lengths(ItemT (&run_values)[RUNS_PER_THREAD],
+                              RunLengthT (&run_lengths)[RUNS_PER_THREAD],
+                              TotalDecodedSizeT& total_decoded_size)
+    {
+        // Compute the offset for the beginning of each run
+        DecodedOffsetT run_offsets[RUNS_PER_THREAD];
+#pragma unroll
+        for(int i = 0; i < RUNS_PER_THREAD; ++i)
+        {
+            run_offsets[i] = static_cast<DecodedOffsetT>(run_lengths[i]);
+        }
+
+        DecodedOffsetT decoded_size_aggregate{};
+        block_scan_type().exclusive_scan(run_offsets,
+                                         run_offsets,
+                                         0,
+                                         decoded_size_aggregate,
+                                         temp_storage.offset_scan,
+                                         rocprim::plus<DecodedOffsetT>{});
+        total_decoded_size = static_cast<TotalDecodedSizeT>(decoded_size_aggregate);
+
+        // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation)
+        syncthreads();
+
+        init_with_run_offsets(run_values, run_offsets);
+    }
+
+public:
+    /**
+     * \brief Run-length decodes the runs previously passed to the constructor and returns the run-length decoded
+     * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
+     * run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
+     * the buffer are returned. Subsequent calls to <b>run_length_decode</b> adjusting \p from_decoded_offset can be
+     * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
+     * <b>run_length_decode</b> is not required.
+     * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the
+     * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length
+     * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`.
+     *
+     * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+     * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to
+     * \param[in] from_decoded_offset Offset into the run-length decoded array at which the returned window begins.
+     * Passing an offset that is larger than total_decoded_size results in undefined behavior.
+     */
+    template<typename RelativeOffsetT>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                          RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD],
+                          DecodedOffsetT from_decoded_offset = 0)
+    {
+        // The (global) offset of the first item decoded by this thread
+        DecodedOffsetT thread_decoded_offset
+            = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD;
+
+        // The run that the first decoded item of this thread belongs to
+        // If this thread's <thread_decoded_offset> is already beyond the total decoded size, it will be assigned to the
+        // last run
+        RunOffsetT current_run
+            = rocprim::static_upper_bound<BLOCK_RUNS>(temp_storage.runs.run_offsets,
+                                                      BLOCK_RUNS,
+                                                      thread_decoded_offset)
+              - static_cast<RunOffsetT>(1U);
+
+        // Set the current_run_end to thread_decoded_offset to trigger new run branch in the first iteration
+        DecodedOffsetT current_run_begin, current_run_end = thread_decoded_offset;
+
+        ItemT val{};
+
+#pragma unroll
+        for(DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; ++i, ++thread_decoded_offset)
+        {
+            // If we are in a new run...
+            if(thread_decoded_offset == current_run_end)
+            {
+                // The value of the new run
+                val = temp_storage.runs.run_values[current_run];
+
+                // The run bounds
+                current_run_begin = temp_storage.runs.run_offsets[current_run];
+                current_run_end   = temp_storage.runs.run_offsets[++current_run];
+            }
+
+            // Decode the current run by storing the run's value
+            decoded_items[i] = val;
+            item_offsets[i]  = thread_decoded_offset - current_run_begin;
+        }
+    }
+
+    /**
+     * \brief Run-length decodes the runs previously passed to the constructor and returns the run-length decoded
+     * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the
+     * run-length decode buffer (i.e., <b>DECODED_ITEMS_PER_THREAD * BLOCK_THREADS</b>), only the items that fit within
+     * the buffer are returned. Subsequent calls to <b>run_length_decode</b> adjusting \p from_decoded_offset can be
+     * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to
+     * <b>run_length_decode</b> is not required.
+     *
+     * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement
+     * \param[in] from_decoded_offset Offset into the run-length decoded array at which the returned window begins.
+     * Passing an offset that is larger than total_decoded_size results in undefined behavior.
+     */
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD],
+                          DecodedOffsetT from_decoded_offset = 0)
+    {
+        DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD];
+        run_length_decode(decoded_items, item_offsets, from_decoded_offset);
+    }
+};
+
+END_ROCPRIM_NAMESPACE
+
+#endif
diff --git a/rocprim/include/rocprim/block/block_shuffle.hpp b/rocprim/include/rocprim/block/block_shuffle.hpp
index 3a6e5abc3..50865e19e 100644
--- a/rocprim/include/rocprim/block/block_shuffle.hpp
+++ b/rocprim/include/rocprim/block/block_shuffle.hpp
@@ -1,7 +1,7 @@
 /******************************************************************************
  * Copyright (c) 2011, Duane Merrill.  All rights reserved.
  * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
- * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc.  All rights reserved.
+ * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -35,11 +35,11 @@
 #include "../config.hpp"
 #include "../detail/various.hpp"
 
-#include "../intrinsics.hpp"
 #include "../functional.hpp"
+#include "../intrinsics.hpp"
 
-#include "detail/block_reduce_warp_reduce.hpp"
 #include "detail/block_reduce_raking_reduce.hpp"
+#include "detail/block_reduce_warp_reduce.hpp"
 
 /// \addtogroup blockmodule
 /// @{
@@ -87,11 +87,7 @@ BEGIN_ROCPRIM_NAMESPACE
 /// }
 /// \endcode
 /// \endparblock
-template<
-    class T,
-    unsigned int BlockSizeX,
-    unsigned int BlockSizeY = 1,
-    unsigned int BlockSizeZ = 1>
+template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
 class block_shuffle
 {
     static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
@@ -99,25 +95,23 @@ class block_shuffle
     // Struct used for creating a raw_storage object for this primitive's temporary storage.
     struct storage_type_
     {
-        T prev[BlockSize];
-        T next[BlockSize];
+        T buffer[BlockSize];
     };
 
 public:
-
-    /// \brief Struct used to allocate a temporary memory that is required for thread
-    /// communication during operations provided by related parallel primitive.
-    ///
-    /// Depending on the implemention the operations exposed by parallel primitive may
-    /// require a temporary storage for thread communication. The storage should be allocated
-    /// using keywords <tt>__shared__</tt>. It can be aliased to
-    /// an externally allocated memory, or be a part of a union type with other storage types
-    /// to increase shared memory reusability.
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
-        using storage_type = detail::raw_storage<storage_type_>;
-    #else
-        using storage_type = storage_type_; // only for Doxygen
-    #endif
+/// \brief Struct used to allocate a temporary memory that is required for thread
+/// communication during operations provided by related parallel primitive.
+///
+/// Depending on the implementation the operations exposed by parallel primitive may
+/// require a temporary storage for thread communication. The storage should be allocated
+/// using keywords <tt>__shared__</tt>. It can be aliased to
+/// an externally allocated memory, or be a part of a union type with other storage types
+/// to increase shared memory reusability.
+#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
+    using storage_type = detail::raw_storage<storage_type_>;
+#else
+    using storage_type = storage_type_; // only for Doxygen
+#endif
 
     /// \brief Shuffles data across threads in a block, offseted by the distance value.
     ///
@@ -144,15 +138,12 @@ class block_shuffle
     ///     ...
     /// }
     /// \endcode
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void offset(T input,
-                T& output,
-                int distance = 1)
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(T input, T& output, int distance = 1)
     {
-        offset(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, output, distance
-        );
+        offset(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+               input,
+               output,
+               distance);
     }
 
     /// \brief Shuffles data across threads in a block, offseted by the distance value.
@@ -164,11 +155,8 @@ class block_shuffle
     /// \param [in] input - input data to be shuffled to another thread.
     /// \param [out] output - reference to a output value, that receives data from another thread
     /// \param [in] distance - The input threadId + distance = output threadId.
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void offset(const size_t& flat_id,
-                T input,
-                T& output,
-                int distance)
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        offset(const size_t& flat_id, T input, T& output, int distance)
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         offset(flat_id, input, output, distance, storage);
@@ -184,22 +172,19 @@ class block_shuffle
     /// \param [out] output - reference to a output value, that receives data from another thread
     /// \param [in] distance - The input threadId + distance = output threadId.
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void offset(const size_t& flat_id,
-                T input,
-                T& output,
-                int distance,
-                storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
     {
-        storage_type_& storage_ = storage.get();
-        storage_.prev[flat_id] = input;
+        storage_type_& storage_  = storage.get();
+        storage_.buffer[flat_id] = input;
+
+        const int offset_tid = static_cast<int>(flat_id) + distance;
 
         ::rocprim::syncthreads();
 
-        const int offset_tid = static_cast<int>(flat_id) + distance;
-        if ((offset_tid >= 0) && (offset_tid < (int)BlockSize))
+        if((offset_tid >= 0) && (offset_tid < (int)BlockSize))
         {
-            output = storage_.prev[static_cast<size_t>(offset_tid)];
+            output = storage_.buffer[static_cast<size_t>(offset_tid)];
         }
     }
 
@@ -210,7 +195,7 @@ class block_shuffle
     ///
     /// \param [in] input - input data to be shuffled to another thread.
     /// \param [out] output - reference to a output value, that receives data from another thread
-    /// \param [in] distance - The input threadId + distance = output threadId.
+    /// \param [in] distance - The input threadId + distance = output threadId. Distance magnitude should be <= BlockSize.
     ///
     /// \par Example.
     /// \code{.cpp}
@@ -228,15 +213,12 @@ class block_shuffle
     ///     ...
     /// }
     /// \endcode
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void rotate(T input,
-                T& output,
-                unsigned int distance = 1)
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(T input, T& output, int distance = 1)
     {
-        rotate(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, output, distance
-        );
+        rotate(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+               input,
+               output,
+               distance);
     }
 
     /// \brief Shuffles data across threads in a block, offseted by the distance value.
@@ -248,11 +230,8 @@ class block_shuffle
     /// \param [in] input - input data to be shuffled to another thread.
     /// \param [out] output - reference to a output value, that receives data from another thread
     /// \param [in] distance - The input threadId + distance = output threadId.
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void rotate(const size_t& flat_id,
-                T input,
-                T& output,
-                unsigned int distance)
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        rotate(const size_t& flat_id, T input, T& output, int distance)
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         rotate(flat_id, input, output, distance, storage);
@@ -268,25 +247,26 @@ class block_shuffle
     /// \param [out] output - reference to a output value, that receives data from another thread
     /// \param [in] distance - The input threadId + distance = output threadId.
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void rotate(const size_t& flat_id,
-                T input,
-                T& output,
-                unsigned int distance,
-                storage_type& storage)
+    ROCPRIM_DEVICE ROCPRIM_INLINE void
+        rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage)
     {
-        storage_type_& storage_ = storage.get();
-        storage_.prev[flat_id] = input;
+        storage_type_& storage_  = storage.get();
+        storage_.buffer[flat_id] = input;
 
-        ::rocprim::syncthreads();
-
-        unsigned int offset = threadIdx.x + distance;
-        if (offset >= BlockSize)
+        int offset = static_cast<int>(flat_id) + distance;
+        if(offset >= (int)BlockSize)
+        {
             offset -= BlockSize;
+        }
+        else if(offset < 0)
+        {
+            offset += BlockSize;
+        }
 
-        output = storage_.prev[offset];
-    }
+        ::rocprim::syncthreads();
 
+        output = storage_.buffer[offset];
+    }
 
     /// \brief The thread block rotates a blocked arrange of input items,
     /// shifting it up by one item
@@ -311,15 +291,13 @@ class block_shuffle
     ///     ...
     /// }
     /// \endcode
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void up(T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread])
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T (&input)[ItemsPerThread],
+                                                T (&prev)[ItemsPerThread])
     {
-        this->up(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, prev
-        );
+        this->up(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+                 input,
+                 prev);
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -329,11 +307,9 @@ class block_shuffle
     /// \param [in]  input -  The calling thread's input items
     /// \param [out] prev  -  The corresponding predecessor items (may be aliased to \p input).
     /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void up(const size_t& flat_id,
-            T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread])
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        up(const size_t& flat_id, T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread])
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         this->up(flat_id, input, prev, storage);
@@ -347,31 +323,29 @@ class block_shuffle
     /// \param [out] prev  -  The corresponding predecessor items (may be aliased to \p input).
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
     /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void up(const size_t& flat_id,
-            T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread],
-            storage_type& storage)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id,
+                                          T (&input)[ItemsPerThread],
+                                          T (&prev)[ItemsPerThread],
+                                          storage_type& storage)
     {
-        storage_type_& storage_ = storage.get();
-        storage_.prev[flat_id] = input[ItemsPerThread -1];
-
-        ::rocprim::syncthreads();
+        storage_type_& storage_  = storage.get();
+        storage_.buffer[flat_id] = input[ItemsPerThread - 1];
 
         ROCPRIM_UNROLL
-        for (unsigned int i = ItemsPerThread - 1; i > 0; --i)
+        for(unsigned int i = ItemsPerThread - 1; i > 0; --i)
         {
             prev[i] = input[i - 1];
         }
 
-        if (flat_id > 0)
+        ::rocprim::syncthreads();
+
+        if(flat_id > 0)
         {
-            prev[0] = storage_.prev[flat_id - 1];
+            prev[0] = storage_.buffer[flat_id - 1];
         }
     }
 
-
     /// \brief The thread block rotates a blocked arrange of input items,
     /// shifting it up by one item
     ///
@@ -380,16 +354,14 @@ class block_shuffle
     /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
     /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
     /// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void up(T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread],
-            T &block_suffix)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        up(T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread], T& block_suffix)
     {
-        this->up(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, prev, block_suffix
-        );
+        this->up(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+                 input,
+                 prev,
+                 block_suffix);
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -401,12 +373,11 @@ class block_shuffle
     /// The item \p prev[0] is not updated for <em>thread</em><sub>0</sub>.
     /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
     /// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void up(const size_t& flat_id,
-            T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread],
-            T &block_suffix)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t& flat_id,
+                                                T (&input)[ItemsPerThread],
+                                                T (&prev)[ItemsPerThread],
+                                                T& block_suffix)
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         this->up(flat_id, input, prev, block_suffix, storage);
@@ -422,18 +393,17 @@ class block_shuffle
     /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from
     /// <em>thread</em><sub><tt>BlockSize-1</tt></sub>, provided to all threads
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
-    template <int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void up(const size_t& flat_id,
-            T (&input)[ItemsPerThread],
-            T (&prev)[ItemsPerThread],
-            T &block_suffix,
-            storage_type& storage)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id,
+                                          T (&input)[ItemsPerThread],
+                                          T (&prev)[ItemsPerThread],
+                                          T&            block_suffix,
+                                          storage_type& storage)
     {
         up(flat_id, input, prev, storage);
 
-        // Update block prefix
-        block_suffix = storage->prev[BlockSize - 1];
+        // Update block suffix (the wrapped buffer is reached through get(), as in the other overloads)
+        block_suffix = storage.get().buffer[BlockSize - 1];
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -459,15 +429,13 @@ class block_shuffle
     ///     ...
     /// }
     /// \endcode
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void down(T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread])
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T (&input)[ItemsPerThread],
+                                                  T (&next)[ItemsPerThread])
     {
-        this->down(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, next
-        );
+        this->down(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+                   input,
+                   next);
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -477,11 +445,9 @@ class block_shuffle
     /// \param [in]  input -  The calling thread's input items
     /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void down(const size_t& flat_id,
-              T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread])
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        down(const size_t& flat_id, T (&input)[ItemsPerThread], T (&next)[ItemsPerThread])
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         this->down(flat_id, input, next, storage);
@@ -495,27 +461,26 @@ class block_shuffle
     /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void down(const size_t& flat_id,
-              T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread],
-              storage_type& storage)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id,
+                                            T (&input)[ItemsPerThread],
+                                            T (&next)[ItemsPerThread],
+                                            storage_type& storage)
     {
-        storage_type_& storage_ = storage.get();
-        storage_.next[flat_id] = input[0];
-
-        ::rocprim::syncthreads();
+        storage_type_& storage_  = storage.get();
+        storage_.buffer[flat_id] = input[0];
 
         ROCPRIM_UNROLL
-        for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
+        for(unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
         {
-          next[i] = input[i + 1];
+            next[i] = input[i + 1];
         }
 
-        if (flat_id <(BlockSize -1))
+        ::rocprim::syncthreads();
+
+        if(flat_id < (BlockSize - 1))
         {
-          next[ItemsPerThread -1] = storage_.next[flat_id + 1];
+            next[ItemsPerThread - 1] = storage_.buffer[flat_id + 1];
         }
     }
 
@@ -526,16 +491,14 @@ class block_shuffle
     /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
     /// \param [out] block_prefix -  The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void down(T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread],
-              T &block_prefix)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+        down(T (&input)[ItemsPerThread], T (&next)[ItemsPerThread], T& block_prefix)
     {
-        this->down(
-            ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
-            input, next, block_prefix
-        );
+        this->down(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
+                   input,
+                   next,
+                   block_prefix);
     }
 
     /// \brief The thread block rotates a blocked arrange of input items,
@@ -546,12 +509,11 @@ class block_shuffle
     /// \param [out] next  -  The corresponding successor items (may be aliased to \p input).
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
     /// \param [out] block_prefix -  The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
-    void down(const size_t& flat_id,
-              T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread],
-              T &block_prefix)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t& flat_id,
+                                                  T (&input)[ItemsPerThread],
+                                                  T (&next)[ItemsPerThread],
+                                                  T& block_prefix)
     {
         ROCPRIM_SHARED_MEMORY storage_type storage;
         this->down(flat_id, input, next, block_prefix, storage);
@@ -566,13 +528,12 @@ class block_shuffle
     /// The item \p prev[0] is not updated for <em>thread</em><sub>BlockSize - 1</sub>.
     /// \param [out] block_prefix -  The item \p input[0] from <em>thread</em><sub><tt>0</tt></sub>, provided to all threads
     /// \param [in] storage - reference to a temporary storage object of type storage_type.
-    template <unsigned int ItemsPerThread>
-    ROCPRIM_DEVICE ROCPRIM_INLINE
-    void down(const size_t& flat_id,
-              T (&input)[ItemsPerThread],
-              T (&next)[ItemsPerThread],
-              T &block_prefix,
-              storage_type& storage)
+    template<unsigned int ItemsPerThread>
+    ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id,
+                                            T (&input)[ItemsPerThread],
+                                            T (&next)[ItemsPerThread],
+                                            T&            block_prefix,
+                                            storage_type& storage)
     {
         this->down(flat_id, input, next, storage);
 
@@ -581,7 +542,6 @@ class block_shuffle
     }
 };
 
-
 END_ROCPRIM_NAMESPACE
 
 /// @}
diff --git a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp
index 21e7613f4..f5cfc031d 100644
--- a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp
+++ b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp
@@ -69,14 +69,8 @@ class block_histogram_atomic
             const unsigned int bin = static_cast<unsigned int>(input[i]);
 
             // Get a mask with the threads that have the same value for `bin`.
-            ::rocprim::lane_mask_type peer_mask = ballot(1);
-            ROCPRIM_UNROLL
-            for(unsigned int b = 1; b < Bins; b <<= 1)
-            {
-                const unsigned int bit_set      = bin & b;
-                const auto         bit_set_mask = ballot(bit_set);
-                peer_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
-            }
+            ::rocprim::lane_mask_type peer_mask
+                = ::rocprim::match_any<::rocprim::Log2<Bins>::VALUE>(bin);
 
             // The total number of threads in the warp which also have this digit.
             const unsigned int bin_count = bit_count(peer_mask);
diff --git a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
index a2c2723b7..abcdb3257 100644
--- a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
+++ b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -113,15 +113,7 @@ class block_radix_rank_match
             const digit_counter_type warp_digit_prefix = *digit_counters[i];
 
             // Construct a mask of threads in this wave which have the same digit.
-            ::rocprim::lane_mask_type peer_mask = ::rocprim::ballot(1);
-
-            ROCPRIM_UNROLL
-            for(unsigned int b = 0; b < RadixBits; ++b)
-            {
-                const unsigned int              bit_set      = digit & (1u << b);
-                const ::rocprim::lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set);
-                peer_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
-            }
+            ::rocprim::lane_mask_type peer_mask = ::rocprim::match_any<RadixBits>(digit);
 
             ::rocprim::wave_barrier();
 
@@ -131,8 +123,7 @@ class block_radix_rank_match
             // than the current thread's.
             const unsigned int peer_digit_prefix = rocprim::masked_bit_count(peer_mask);
 
-            // The first thread with a particular digit gets to update the shared counter.
-            if(peer_digit_prefix == 0)
+            if(::rocprim::group_elect(peer_mask))
             {
                 *digit_counters[i] = warp_digit_prefix + digit_count;
             }
diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp
index 5bc5e516f..b78afff25 100644
--- a/rocprim/include/rocprim/config.hpp
+++ b/rocprim/include/rocprim/config.hpp
@@ -72,8 +72,22 @@
     #define ROCPRIM_FORCE_INLINE __attribute__((always_inline))
 #endif
 
-#ifndef ROCPRIM_DISABLE_DPP
-    #define ROCPRIM_DETAIL_USE_DPP true
+// DPP is supported only after Volcanic Islands (GFX8+)
+// Only defined when support is present, in contrast to ROCPRIM_DETAIL_USE_DPP, which should be
+// always defined
+#if defined(__HIP_DEVICE_COMPILE__) && defined(__AMDGCN__) \
+    && (!defined(__GFX6__) && !defined(__GFX7__))
+    #define ROCPRIM_DETAIL_HAS_DPP 1
+#endif
+
+#if !defined(ROCPRIM_DISABLE_DPP) && defined(ROCPRIM_DETAIL_HAS_DPP)
+    #define ROCPRIM_DETAIL_USE_DPP 1
+#else
+    #define ROCPRIM_DETAIL_USE_DPP 0
+#endif
+
+#if defined(ROCPRIM_DETAIL_HAS_DPP) && (defined(__GFX8__) || defined(__GFX9__))
+    #define ROCPRIM_DETAIL_HAS_DPP_BROADCAST 1
 #endif
 
 #ifndef ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS
@@ -95,11 +109,12 @@
     #define ROCPRIM_TARGET_ARCH 0
 #endif
 
-#if(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ || __gfx1032__ \
-    || __gfx1035__ || __gfx1100__ || __gfx1101__ || __gfx1102__)
-    #define ROCPRIM_NAVI 1
-#else
-    #define ROCPRIM_NAVI 0
+#ifndef ROCPRIM_NAVI
+    #if defined(__HIP_DEVICE_COMPILE__) && (defined(__GFX10__) || defined(__GFX11__))
+        #define ROCPRIM_NAVI 1
+    #else
+        #define ROCPRIM_NAVI 0
+    #endif
 #endif
 #define ROCPRIM_ARCH_90a 910
 
diff --git a/rocprim/include/rocprim/detail/radix_sort.hpp b/rocprim/include/rocprim/detail/radix_sort.hpp
index 66ba2e356..32ff17e87 100644
--- a/rocprim/include/rocprim/detail/radix_sort.hpp
+++ b/rocprim/include/rocprim/detail/radix_sort.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -67,6 +67,33 @@ struct radix_key_codec_integral<Key, BitKey, typename std::enable_if<::rocprim::
     }
 };
 
+template<class Key, class BitKey>
+struct radix_key_codec_integral<
+    Key,
+    BitKey,
+    typename std::enable_if<std::is_same<Key, __uint128_t>::value>::type>
+{
+    using bit_key_type = BitKey;
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE static bit_key_type encode(Key key)
+    {
+        return __builtin_bit_cast(bit_key_type, key);
+    }
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE static Key decode(bit_key_type bit_key)
+    {
+        return __builtin_bit_cast(Key, bit_key);
+    }
+
+    template<bool Descending>
+    ROCPRIM_DEVICE static unsigned int
+        extract_digit(bit_key_type bit_key, unsigned int start, unsigned int length)
+    {
+        unsigned int mask = (1u << length) - 1;
+        return static_cast<unsigned int>(bit_key >> start) & mask;
+    }
+};
+
 template<class Key, class BitKey>
 struct radix_key_codec_integral<Key, BitKey, typename std::enable_if<::rocprim::is_signed<Key>::value>::type>
 {
@@ -97,6 +124,36 @@ struct radix_key_codec_integral<Key, BitKey, typename std::enable_if<::rocprim::
     }
 };
 
+template<class Key, class BitKey>
+struct radix_key_codec_integral<Key,
+                                BitKey,
+                                typename std::enable_if<std::is_same<Key, __int128_t>::value>::type>
+{
+    using bit_key_type = BitKey;
+
+    static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1);
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE static bit_key_type encode(Key key)
+    {
+        const bit_key_type bit_key = __builtin_bit_cast(bit_key_type, key);
+        return sign_bit ^ bit_key;
+    }
+
+    ROCPRIM_DEVICE ROCPRIM_INLINE static Key decode(bit_key_type bit_key)
+    {
+        bit_key ^= sign_bit;
+        return __builtin_bit_cast(Key, bit_key);
+    }
+
+    template<bool Descending>
+    ROCPRIM_DEVICE static unsigned int
+        extract_digit(bit_key_type bit_key, unsigned int start, unsigned int length)
+    {
+        unsigned int mask = (1u << length) - 1;
+        return static_cast<unsigned int>(bit_key >> start) & mask;
+    }
+};
+
 template<class Key>
 struct float_bit_mask;
 
@@ -199,6 +256,18 @@ struct radix_key_codec_base<
     typename std::enable_if<::rocprim::is_integral<Key>::value>::type
 > : radix_key_codec_integral<Key, typename std::make_unsigned<Key>::type> { };
 
+template<class Key>
+struct radix_key_codec_base<Key,
+                            typename std::enable_if<std::is_same<Key, __int128_t>::value>::type>
+    : radix_key_codec_integral<Key, __uint128_t>
+{};
+
+template<class Key>
+struct radix_key_codec_base<Key,
+                            typename std::enable_if<std::is_same<Key, __uint128_t>::value>::type>
+    : radix_key_codec_integral<Key, __uint128_t>
+{};
+
 template<>
 struct radix_key_codec_base<bool>
 {
diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp
index c83934cbc..0b8c75cd8 100644
--- a/rocprim/include/rocprim/device/config_types.hpp
+++ b/rocprim/include/rocprim/device/config_types.hpp
@@ -29,7 +29,6 @@
 #include <cassert>
 
 #include "../config.hpp"
-#include "../intrinsics/thread.hpp"
 #include "../detail/various.hpp"
 
 /// \addtogroup primitivesmodule_deviceconfigs
@@ -49,7 +48,7 @@ struct default_config
     // merge_sort_config
     using block_sort_config  = default_config;
     using block_merge_config = default_config;
-    // radix_sort_config_v2
+    // radix_sort_config
     using single_sort_config = default_config;
     using merge_sort_config  = default_config;
     using onesweep_config    = default_config;
@@ -227,7 +226,7 @@ constexpr target_arch get_target_arch_from_name(const char* const arch_name, con
 /**
  * \brief Get the current architecture in device compilation.
  * 
- * This function will always return `unkown` when called from the host, host could should instead
+ * This function will always return `unknown` when called from the host, host code should instead
  * call host_target_arch to query the current device from the HIP API.
  * 
  * \return target_arch the architecture currently being compiled for on the device.
@@ -318,7 +317,6 @@ inline hipError_t get_device_arch(int device_id, target_arch& arch)
     return hipSuccess;
 }
 
-#ifndef _WIN32
 inline hipError_t get_device_from_stream(const hipStream_t stream, int& device_id)
 {
     static constexpr hipStream_t default_stream = 0;
@@ -343,15 +341,9 @@ inline hipError_t get_device_from_stream(const hipStream_t stream, int& device_i
 #endif
     return hipSuccess;
 }
-#endif
 
 inline hipError_t host_target_arch(const hipStream_t stream, target_arch& arch)
 {
-#ifdef _WIN32
-    (void)stream;
-    arch = target_arch::unknown;
-    return hipSuccess;
-#else
     int              device_id;
     const hipError_t result = get_device_from_stream(stream, device_id);
     if(result != hipSuccess)
@@ -360,11 +352,48 @@ inline hipError_t host_target_arch(const hipStream_t stream, target_arch& arch)
     }
 
     return get_device_arch(device_id, arch);
-#endif
 }
 
 } // end namespace detail
 
+/// \brief Returns the number of threads in a hardware warp for the given device.
+/// On the host side this value is only available at runtime.
+/// \param device_id - the device that should be queried.
+/// \param warp_size - out parameter for the warp size; all bits set if the query fails.
+/// \return hipError_t the result of querying the device properties.
+///
+/// The warp size is constant for a device.
+ROCPRIM_HOST inline hipError_t host_warp_size(const int device_id, unsigned int& warp_size)
+{
+    // Sentinel (maximum unsigned value) that is left in place when the query below fails.
+    warp_size = static_cast<unsigned int>(-1);
+    hipDeviceProp_t  device_prop;
+    const hipError_t result = hipGetDeviceProperties(&device_prop, device_id);
+    if(result == hipSuccess)
+    {
+        warp_size = device_prop.warpSize;
+    }
+    return result;
+}
+
+/// \brief Returns the number of threads in a hardware warp for the device associated with the stream.
+/// On the host side this value is only available at runtime.
+/// \param stream - the stream whose device should be queried.
+/// \param warp_size - out parameter for the warp size; not written if the device lookup fails.
+/// \return hipError_t the error of the device lookup, otherwise the result of the warp size query.
+///
+/// The warp size is constant for a device.
+ROCPRIM_HOST inline hipError_t host_warp_size(const hipStream_t stream, unsigned int& warp_size)
+{
+    int              device_id;
+    const hipError_t result = detail::get_device_from_stream(stream, device_id);
+    if(result != hipSuccess)
+    {
+        return result;
+    }
+    return host_warp_size(device_id, warp_size);
+}
+
 END_ROCPRIM_NAMESPACE
 
 /// @}
diff --git a/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp
new file mode 100644
index 000000000..8f5cdabdb
--- /dev/null
+++ b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp
@@ -0,0 +1,527 @@
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_
+#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_
+
+#include "../../../type_traits.hpp"
+#include "../device_config_helper.hpp"
+#include <type_traits>
+
+/* DO NOT EDIT THIS FILE
+ * This file is automatically generated by `/scripts/autotune/create_optimization.py`,
+ * so most likely you want to edit rocprim/device/device_(algo)_config.hpp instead.
+ */
+
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+template<unsigned int arch, class value_type, class enable = void>
+struct default_adjacent_difference_config : default_adjacent_difference_config_base<value_type>
+{};
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<128, 4>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<256, 4>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<32, 8>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 8>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<32, 4>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<1024, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<32, 4>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<256, 32>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<128, 64>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<128, 64>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 8>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<128, 8>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 8>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<128, 8>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 8>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<128, 2>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<128, 8>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group primitivesmodule_deviceconfigs
+
+#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_
\ No newline at end of file
diff --git a/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp
new file mode 100644
index 000000000..0718b6e65
--- /dev/null
+++ b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp
@@ -0,0 +1,529 @@
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_
+#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_
+
+#include "../../../type_traits.hpp"
+#include "../device_config_helper.hpp"
+#include <type_traits>
+
+/* DO NOT EDIT THIS FILE
+ * This file is automatically generated by `/scripts/autotune/create_optimization.py`,
+ * so most likely you want to edit rocprim/device/device_(algo)_config.hpp instead.
+ */
+
+/// \addtogroup primitivesmodule_deviceconfigs
+/// @{
+
+BEGIN_ROCPRIM_NAMESPACE
+
+namespace detail
+{
+
+template<unsigned int arch, class value_type, class enable = void>
+struct default_adjacent_difference_inplace_config
+    : default_adjacent_difference_config_base<value_type>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 16>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 16>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<128, 16>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<256, 32>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1102),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<512, 32>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<1024, 8>
+{};
+
+// Based on value_type = int64_t
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 4>
+{};
+
+// Based on value_type = int
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = short
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<1024, 8>
+{};
+
+// Based on value_type = int8_t
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx1030),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<32, 64>
+{};
+
+// Based on value_type = double
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = float
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<128, 64>
+{};
+
+// Based on value_type = rocprim::half
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<256, 64>
+{};
+
+// Based on value_type = int64_t; gfx900 default for non-floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = int; gfx900 default for non-floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<128, 64>
+{};
+
+// Based on value_type = short; gfx900 default for non-floating-point types of 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<256, 64>
+{};
+
+// Based on value_type = int8_t; gfx900 default for non-floating-point types of 1 byte.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx900),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<512, 16>
+{};
+
+// Based on value_type = double; gfx906 default for floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = float; gfx906 default for floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 8>
+{};
+
+// Based on value_type = rocprim::half; gfx906 default for floating-point types of <= 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = int64_t; gfx906 default for non-floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = int; gfx906 default for non-floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<512, 16>
+{};
+
+// Based on value_type = short; gfx906 default for non-floating-point types of 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<256, 16>
+{};
+
+// Based on value_type = int8_t; gfx906 default for non-floating-point types of 1 byte.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx906),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double; gfx908 default for floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = float; gfx908 default for floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = rocprim::half; gfx908 default for floating-point types of <= 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8>
+{};
+
+// Based on value_type = int64_t; gfx908 default for non-floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = int; gfx908 default for non-floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = short; gfx908 default for non-floating-point types of 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<64, 32>
+{};
+
+// Based on value_type = int8_t; gfx908 default for non-floating-point types of 1 byte.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx908),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double; unknown-arch default for floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = float; unknown-arch default for floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = rocprim::half; unknown-arch default for floating-point types of <= 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8>
+{};
+
+// Based on value_type = int64_t; unknown-arch default for non-floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = int; unknown-arch default for non-floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = short; unknown-arch default for non-floating-point types of 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<64, 32>
+{};
+
+// Based on value_type = int8_t; unknown-arch default for non-floating-point types of 1 byte.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::unknown),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+// Based on value_type = double; gfx90a default for floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = float; gfx90a default for floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = rocprim::half; gfx90a default for floating-point types of <= 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8>
+{};
+
+// Based on value_type = int64_t; gfx90a default for non-floating-point types of 5-8 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
+    : adjacent_difference_config<512, 2>
+{};
+
+// Based on value_type = int; gfx90a default for non-floating-point types of 3-4 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
+    : adjacent_difference_config<1024, 4>
+{};
+
+// Based on value_type = short; gfx90a default for non-floating-point types of 2 bytes.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
+    : adjacent_difference_config<64, 32>
+{};
+
+// Based on value_type = int8_t; gfx90a default for non-floating-point types of 1 byte.
+template<class value_type>
+struct default_adjacent_difference_inplace_config<
+    static_cast<unsigned int>(target_arch::gfx90a),
+    value_type,
+    std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
+                      && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16>
+{};
+
+} // end namespace detail
+
+END_ROCPRIM_NAMESPACE
+
+/// @}
+// end of group primitivesmodule_deviceconfigs
+
+#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_
diff --git a/rocprim/include/rocprim/device/detail/config/device_scan.hpp b/rocprim/include/rocprim/device/detail/config/device_scan.hpp
index 867753a07..7fe8e7259 100644
--- a/rocprim/include/rocprim/device/detail/config/device_scan.hpp
+++ b/rocprim/include/rocprim/device/detail/config/device_scan.hpp
@@ -49,11 +49,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = float
@@ -63,11 +63,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -76,11 +76,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx908),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     18,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  18,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = int64_t
@@ -90,11 +90,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -104,11 +104,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = short
@@ -118,11 +118,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<128,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<128,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int8_t
@@ -131,11 +131,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx908),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = double
@@ -145,11 +145,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     15,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  15,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = float
@@ -159,11 +159,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     10,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  10,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -172,11 +172,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx900),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int64_t
@@ -186,11 +186,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     10,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  10,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -200,11 +200,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     10,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  10,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = short
@@ -214,11 +214,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int8_t
@@ -227,11 +227,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx900),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = double
@@ -241,11 +241,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<128,
-                     10,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<128,
+                  10,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = float
@@ -255,11 +255,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     15,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  15,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -268,11 +268,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx906),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = int64_t
@@ -282,11 +282,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<64,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<64,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -296,11 +296,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     15,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  15,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = short
@@ -310,11 +310,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = int8_t
@@ -323,11 +323,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx906),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = double
@@ -337,11 +337,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<128,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<128,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = float
@@ -351,11 +351,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<64,
-                     18,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<64,
+                  18,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -364,11 +364,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx1030),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     22,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  22,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int64_t
@@ -378,11 +378,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     9,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  9,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -392,11 +392,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<64,
-                     18,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<64,
+                  18,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = short
@@ -406,11 +406,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<256,
-                     22,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  22,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int8_t
@@ -419,11 +419,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx1030),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = double
@@ -433,11 +433,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = float
@@ -447,11 +447,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -460,11 +460,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::unknown),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     18,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  18,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = int64_t
@@ -474,11 +474,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -488,11 +488,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = short
@@ -502,11 +502,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<128,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<128,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int8_t
@@ -515,11 +515,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::unknown),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = double
@@ -529,11 +529,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = float
@@ -543,11 +543,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = rocprim::half
@@ -556,11 +556,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx90a),
                            value_type,
                            std::enable_if_t<(bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 2))>>
-    : scan_config_v2<256,
-                     18,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::reduce_then_scan>
+    : scan_config<256,
+                  18,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on value_type = int64_t
@@ -570,11 +570,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_config_v2<256,
-                     6,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  6,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int
@@ -584,11 +584,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_config_v2<256,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = short
@@ -598,11 +598,11 @@ struct default_scan_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_config_v2<128,
-                     14,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<128,
+                  14,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on value_type = int8_t
@@ -611,11 +611,11 @@ struct default_scan_config<static_cast<unsigned int>(target_arch::gfx90a),
                            value_type,
                            std::enable_if_t<(!bool(rocprim::is_floating_point<value_type>::value)
                                              && (sizeof(value_type) <= 1))>>
-    : scan_config_v2<256,
-                     24,
-                     ::rocprim::block_load_method::block_load_transpose,
-                     ::rocprim::block_store_method::block_store_transpose,
-                     block_scan_algorithm::using_warp_scan>
+    : scan_config<256,
+                  24,
+                  ::rocprim::block_load_method::block_load_transpose,
+                  ::rocprim::block_store_method::block_store_transpose,
+                  block_scan_algorithm::using_warp_scan>
 {};
 
 } // end namespace detail
diff --git a/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp b/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp
index 90ae38b42..f6f375f24 100644
--- a/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp
+++ b/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp
@@ -51,11 +51,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -67,11 +67,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -83,11 +83,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -98,11 +98,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -114,11 +114,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -130,11 +130,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -146,11 +146,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -161,11 +161,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -176,11 +176,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -191,11 +191,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -206,11 +206,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -221,11 +221,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -237,11 +237,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -253,11 +253,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -269,11 +269,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -284,11 +284,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -300,11 +300,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -316,11 +316,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -332,11 +332,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -347,11 +347,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -363,11 +363,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -379,11 +379,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -395,11 +395,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -410,11 +410,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -425,11 +425,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -440,11 +440,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -455,11 +455,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -470,11 +470,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = int64_t
@@ -486,11 +486,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -502,11 +502,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            19,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         19,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -518,11 +518,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -533,11 +533,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -549,11 +549,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -565,11 +565,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -581,11 +581,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -596,11 +596,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -611,11 +611,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -626,11 +626,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -641,11 +641,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -656,11 +656,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -672,11 +672,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -688,11 +688,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            19,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         19,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -704,11 +704,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -719,11 +719,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -735,11 +735,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -751,11 +751,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -767,11 +767,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -782,11 +782,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -798,11 +798,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -814,11 +814,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -830,11 +830,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -845,11 +845,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -860,11 +860,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -875,11 +875,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            17,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         17,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -890,11 +890,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -905,11 +905,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int64_t
@@ -921,11 +921,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -937,11 +937,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -953,11 +953,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            23,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         23,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -968,11 +968,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -984,11 +984,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            15,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         15,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -1000,11 +1000,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            15,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         15,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -1016,11 +1016,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -1031,11 +1031,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -1046,11 +1046,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -1061,11 +1061,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -1076,11 +1076,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -1091,11 +1091,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -1107,11 +1107,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -1123,11 +1123,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -1139,11 +1139,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            23,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         23,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -1154,11 +1154,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -1170,11 +1170,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            15,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         15,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -1186,11 +1186,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            15,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         15,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -1202,11 +1202,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -1217,11 +1217,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -1233,11 +1233,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -1249,11 +1249,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -1265,11 +1265,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -1280,11 +1280,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -1295,11 +1295,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -1310,11 +1310,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            13,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         13,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -1325,11 +1325,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -1340,11 +1340,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = int64_t
@@ -1356,11 +1356,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            23,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         23,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -1372,11 +1372,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            23,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         23,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -1388,11 +1388,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -1403,11 +1403,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            17,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         17,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -1419,11 +1419,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            9,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         9,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -1435,11 +1435,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            15,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         15,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -1451,11 +1451,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -1466,11 +1466,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            7,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         7,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -1481,11 +1481,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            23,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         23,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -1496,11 +1496,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -1511,11 +1511,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<128,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -1526,11 +1526,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -1542,11 +1542,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            13,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         13,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -1558,11 +1558,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            19,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         19,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -1574,11 +1574,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -1589,11 +1589,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -1605,11 +1605,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            9,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         9,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -1621,11 +1621,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -1637,11 +1637,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -1652,11 +1652,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            7,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         7,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -1668,11 +1668,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            9,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         9,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -1684,11 +1684,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -1700,11 +1700,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<128,
-                            22,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         22,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -1715,11 +1715,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -1730,11 +1730,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            17,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<128,
+                         17,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -1745,11 +1745,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -1760,11 +1760,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<128,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -1775,11 +1775,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int64_t
@@ -1791,11 +1791,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -1807,11 +1807,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -1823,11 +1823,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -1838,11 +1838,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -1854,11 +1854,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -1870,11 +1870,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -1886,11 +1886,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -1901,11 +1901,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -1916,11 +1916,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -1931,11 +1931,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -1946,11 +1946,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -1961,11 +1961,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -1977,11 +1977,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -1993,11 +1993,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -2009,11 +2009,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -2024,11 +2024,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -2040,11 +2040,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -2056,11 +2056,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -2072,11 +2072,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -2087,11 +2087,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -2103,11 +2103,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -2119,11 +2119,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -2135,11 +2135,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -2150,11 +2150,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -2165,11 +2165,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -2180,11 +2180,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -2195,11 +2195,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -2210,11 +2210,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = int64_t
@@ -2226,11 +2226,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int
@@ -2242,11 +2242,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = double, value_type = short
@@ -2258,11 +2258,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = double, value_type = int8_t
@@ -2273,11 +2273,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int64_t
@@ -2289,11 +2289,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = float, value_type = int
@@ -2305,11 +2305,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = short
@@ -2321,11 +2321,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = float, value_type = int8_t
@@ -2336,11 +2336,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int64_t
@@ -2351,11 +2351,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int
@@ -2366,11 +2366,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = short
@@ -2381,11 +2381,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = rocprim::half, value_type = int8_t
@@ -2396,11 +2396,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int64_t
@@ -2412,11 +2412,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int
@@ -2428,11 +2428,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int64_t, value_type = short
@@ -2444,11 +2444,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<64,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int64_t, value_type = int8_t
@@ -2459,11 +2459,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 8)
                       && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int64_t
@@ -2475,11 +2475,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<128,
-                            10,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         10,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int, value_type = int
@@ -2491,11 +2491,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            12,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         12,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = short
@@ -2507,11 +2507,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int, value_type = int8_t
@@ -2522,11 +2522,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 4)
                       && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<128,
-                            14,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         14,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = int64_t
@@ -2538,11 +2538,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8)
                       && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int
@@ -2554,11 +2554,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4)
                       && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<256,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = short, value_type = short
@@ -2570,11 +2570,11 @@ struct default_scan_by_key_config<
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2)
                       && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = short, value_type = int8_t
@@ -2585,11 +2585,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 2)
                       && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int64_t
@@ -2600,11 +2600,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>>
-    : scan_by_key_config_v2<64,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<64,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int
@@ -2615,11 +2615,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>>
-    : scan_by_key_config_v2<128,
-                            18,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::using_warp_scan>
+    : scan_by_key_config<128,
+                         18,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::using_warp_scan>
 {};
 
 // Based on key_type = int8_t, value_type = short
@@ -2630,11 +2630,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>>
-    : scan_by_key_config_v2<256,
-                            20,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         20,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 // Based on key_type = int8_t, value_type = int8_t
@@ -2645,11 +2645,11 @@ struct default_scan_by_key_config<
     value_type,
     std::enable_if_t<(!bool(rocprim::is_floating_point<key_type>::value) && (sizeof(key_type) <= 1)
                       && (sizeof(value_type) <= 1))>>
-    : scan_by_key_config_v2<256,
-                            24,
-                            ::rocprim::block_load_method::block_load_transpose,
-                            ::rocprim::block_store_method::block_store_transpose,
-                            block_scan_algorithm::reduce_then_scan>
+    : scan_by_key_config<256,
+                         24,
+                         ::rocprim::block_load_method::block_load_transpose,
+                         ::rocprim::block_store_method::block_store_transpose,
+                         block_scan_algorithm::reduce_then_scan>
 {};
 
 } // end namespace detail
diff --git a/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp
index fca5bc979..4a7592b5d 100644
--- a/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp
+++ b/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,7 @@
 #include "../../detail/various.hpp"
 
 #include "../../config.hpp"
+#include "device_config_helper.hpp"
 
 #include <hip/hip_runtime.h>
 
@@ -180,18 +181,25 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void adjacent_difference_kernel_impl(
     using input_type  = typename std::iterator_traits<InputIt>::value_type;
     using output_type = typename std::iterator_traits<OutputIt>::value_type;
 
-    static constexpr unsigned int block_size       = Config::block_size;
-    static constexpr unsigned int items_per_thread = Config::items_per_thread;
+    static constexpr adjacent_difference_config_params params = device_params<Config>();
+
+    static constexpr unsigned int block_size = params.adjacent_difference_kernel_config.block_size;
+    static constexpr unsigned int items_per_thread
+        = params.adjacent_difference_kernel_config.items_per_thread;
     static constexpr unsigned int items_per_block  = block_size * items_per_thread;
 
     using block_load_type
-        = ::rocprim::block_load<input_type, block_size, items_per_thread, Config::load_method>;
-    using block_store_type
-        = ::rocprim::block_store<output_type, block_size, items_per_thread, Config::store_method>;
+        = ::rocprim::block_load<input_type, block_size, items_per_thread, params.block_load_method>;
+    using block_store_type = ::rocprim::
+        block_store<output_type, block_size, items_per_thread, params.block_store_method>;
 
     using adjacent_helper = adjacent_diff_helper<input_type, block_size>;
 
+#if defined(__gfx1102__) or defined(__gfx1030__)
     ROCPRIM_SHARED_MEMORY struct
+#else
+    ROCPRIM_SHARED_MEMORY union
+#endif
     {
         typename block_load_type::storage_type  load;
         typename adjacent_helper::storage_type  adjacent_diff;
diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp
index 999df33cc..532b8d8cd 100644
--- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp
+++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp
@@ -47,7 +47,6 @@ namespace detail
 struct merge_sort_block_sort_config_params
 {
     kernel_config_params block_sort_config = {0, 0};
-    block_sort_algorithm block_sort_method = block_sort_algorithm::stable_merge_sort;
 };
 
 // Necessary to construct a parameterized type of `merge_sort_block_sort_config_params`.
@@ -57,7 +56,7 @@ struct merge_sort_block_sort_config : rocprim::detail::merge_sort_block_sort_con
 {
     using sort_config = kernel_config<BlockSize, ItemsPerThread>;
     constexpr merge_sort_block_sort_config()
-        : rocprim::detail::merge_sort_block_sort_config_params{sort_config(), Algo} {};
+        : rocprim::detail::merge_sort_block_sort_config_params{sort_config()} {};
 };
 
 constexpr unsigned int merge_sort_items_per_thread(const unsigned int item_scale)
@@ -206,6 +205,9 @@ struct radix_sort_onesweep_config : detail::radix_sort_onesweep_config_params
 namespace detail
 {
 
+struct reduce_config_tag
+{};
+
 // Calculate kernel configurations, such that it will not exceed shared memory maximum
 template<class Key, class Value>
 struct radix_sort_onesweep_config_base
@@ -240,6 +242,8 @@ template<unsigned int                      BlockSize      = 256,
          unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
 struct reduce_config : rocprim::detail::reduce_config_params
 {
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::reduce_config_tag;
     constexpr reduce_config()
         : rocprim::detail::reduce_config_params{
             {BlockSize, ItemsPerThread, SizeLimit},
@@ -265,6 +269,9 @@ template<class Value>
 struct default_reduce_config_base : default_reduce_config_base_helper<Value>::type
 {};
 
+struct scan_config_tag
+{};
+
 /// \brief Provides the kernel parameters for exclusive_scan and inclusive_scan based
 ///        on autotuned configurations or user-provided configurations.
 struct scan_config_params
@@ -291,8 +298,10 @@ template<unsigned int                    BlockSize,
          ::rocprim::block_store_method   BlockStoreMethod,
          ::rocprim::block_scan_algorithm BlockScanMethod,
          unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
-struct scan_config_v2 : ::rocprim::detail::scan_config_params
+struct scan_config : ::rocprim::detail::scan_config_params
 {
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::scan_config_tag;
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
     // Requirement dictated by init_lookback_scan_state_kernel.
     static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
@@ -311,54 +320,6 @@ struct scan_config_v2 : ::rocprim::detail::scan_config_params
     /// \brief Limit on the number of items for a single scan kernel launch.
     static constexpr unsigned int size_limit = SizeLimit;
 
-    constexpr scan_config_v2()
-        : ::rocprim::detail::scan_config_params{
-            {BlockSize, ItemsPerThread, SizeLimit},
-            BlockLoadMethod,
-            BlockStoreMethod,
-            BlockScanMethod
-    } {};
-#endif
-};
-
-/// \brief Deprecated: Configuration of device-level scan primitives.
-///
-/// \tparam BlockSize - number of threads in a block.
-/// \tparam ItemsPerThread - number of items processed by each thread.
-/// \tparam UseLookback - deprecated, scan always uses lookback scan.
-/// \tparam BlockLoadMethod - method for loading input values.
-/// \tparam StoreLoadMethod - method for storing values.
-/// \tparam BlockScanMethod - algorithm for block scan.
-/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
-template<unsigned int                    BlockSize,
-         unsigned int                    ItemsPerThread,
-         bool                            UseLookback,
-         ::rocprim::block_load_method    BlockLoadMethod,
-         ::rocprim::block_store_method   BlockStoreMethod,
-         ::rocprim::block_scan_algorithm BlockScanMethod,
-         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
-struct
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition
-[[deprecated("The UseLookback switch has been removed, as scan now only supports the "
-                    "lookback-scan implementation. Use scan_config_v2 instead.")]] 
-#endif
-scan_config : ::rocprim::detail::scan_config_params
-{
-    /// \brief Number of threads in a block.
-    static constexpr unsigned int block_size = BlockSize;
-    /// \brief Number of items processed by each thread.
-    static constexpr unsigned int items_per_thread = ItemsPerThread;
-    /// \brief Whether to use lookback scan or reduce-then-scan algorithm.
-    static constexpr bool use_lookback = UseLookback;
-    /// \brief Method for loading input values.
-    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;
-    /// \brief Method for storing values.
-    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;
-    /// \brief Algorithm for block scan.
-    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
-    /// \brief Limit on the number of items for a single scan kernel launch.
-    static constexpr unsigned int size_limit = SizeLimit;
-
     constexpr scan_config()
         : ::rocprim::detail::scan_config_params{
             {BlockSize, ItemsPerThread, SizeLimit},
@@ -366,22 +327,26 @@ scan_config : ::rocprim::detail::scan_config_params
             BlockStoreMethod,
             BlockScanMethod
     } {};
+#endif
 };
 
 namespace detail
 {
 
+struct scan_by_key_config_tag
+{};
+
 template<class Value>
 struct default_scan_config_base_helper
 {
     static constexpr unsigned int item_scale
         = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
 
-    using type = scan_config_v2<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
-                                ::rocprim::max(1u, 16u / item_scale),
-                                ::rocprim::block_load_method::block_load_transpose,
-                                ::rocprim::block_store_method::block_store_transpose,
-                                ::rocprim::block_scan_algorithm::using_warp_scan>;
+    using type = scan_config<limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+                             ::rocprim::max(1u, 16u / item_scale),
+                             ::rocprim::block_load_method::block_load_transpose,
+                             ::rocprim::block_store_method::block_store_transpose,
+                             ::rocprim::block_scan_algorithm::using_warp_scan>;
 };
 
 template<class Value>
@@ -414,8 +379,10 @@ template<unsigned int                    BlockSize,
          ::rocprim::block_store_method   BlockStoreMethod,
          ::rocprim::block_scan_algorithm BlockScanMethod,
          unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
-struct scan_by_key_config_v2 : ::rocprim::detail::scan_by_key_config_params
+struct scan_by_key_config : ::rocprim::detail::scan_by_key_config_params
 {
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::scan_by_key_config_tag;
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
     // Requirement dictated by init_lookback_scan_state_kernel.
     static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
@@ -434,55 +401,6 @@ struct scan_by_key_config_v2 : ::rocprim::detail::scan_by_key_config_params
     /// \brief Limit on the number of items for a single scan kernel launch.
     static constexpr unsigned int size_limit = SizeLimit;
 
-    constexpr scan_by_key_config_v2()
-        : ::rocprim::detail::scan_by_key_config_params{
-            {BlockSize, ItemsPerThread, SizeLimit},
-            BlockLoadMethod,
-            BlockStoreMethod,
-            BlockScanMethod
-    } {};
-#endif
-};
-
-/// \brief Deprecated: Configuration of device-level scan-by-key operation.
-///
-/// \tparam BlockSize - number of threads in a block.
-/// \tparam ItemsPerThread - number of items processed by each thread.
-/// \tparam UseLookback - deprecated, scan always uses lookback scan.
-/// \tparam BlockLoadMethod - method for loading input values.
-/// \tparam StoreLoadMethod - method for storing values.
-/// \tparam BlockScanMethod - algorithm for block scan.
-/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch.
-template<unsigned int                    BlockSize,
-         unsigned int                    ItemsPerThread,
-         bool                            UseLookback,
-         ::rocprim::block_load_method    BlockLoadMethod,
-         ::rocprim::block_store_method   BlockStoreMethod,
-         ::rocprim::block_scan_algorithm BlockScanMethod,
-         unsigned int                    SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
-struct
-#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition
-[[deprecated(
-    "The UseLookback switch has been removed, as scan now only supports the lookback-scan "
-    "implementation. Use scan_by_key_config_v2 instead.")]]
-#endif
-scan_by_key_config : ::rocprim::detail::scan_by_key_config_params
-{
-    /// \brief Number of threads in a block.
-    static constexpr unsigned int block_size = BlockSize;
-    /// \brief Number of items processed by each thread.
-    static constexpr unsigned int items_per_thread = ItemsPerThread;
-    /// \brief Whether to use lookback scan or reduce-then-scan algorithm.
-    static constexpr bool use_lookback = UseLookback;
-    /// \brief Method for loading input values.
-    static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod;
-    /// \brief Method for storing values.
-    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;
-    /// \brief Algorithm for block scan.
-    static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
-    /// \brief Limit on the number of items for a single scan kernel launch.
-    static constexpr unsigned int size_limit = SizeLimit;
-
     constexpr scan_by_key_config()
         : ::rocprim::detail::scan_by_key_config_params{
             {BlockSize, ItemsPerThread, SizeLimit},
@@ -490,6 +408,7 @@ scan_by_key_config : ::rocprim::detail::scan_by_key_config_params
             BlockStoreMethod,
             BlockScanMethod
     } {};
+#endif
 };
 
 namespace detail
@@ -501,7 +420,7 @@ struct default_scan_by_key_config_base_helper
     static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
         sizeof(Key) + sizeof(Value), 2 * sizeof(int));
 
-    using type = scan_by_key_config_v2<
+    using type = scan_by_key_config<
         limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
         ::rocprim::max(1u, 16u / item_scale),
         ::rocprim::block_load_method::block_load_transpose,
@@ -513,6 +432,9 @@ template<class Key, class Value>
 struct default_scan_by_key_config_base : default_scan_by_key_config_base_helper<Key, Value>::type
 {};
 
+struct transform_config_tag
+{};
+
 struct transform_config_params
 {
     kernel_config_params kernel_config{};
@@ -527,8 +449,10 @@ struct transform_config_params
 template<unsigned int BlockSize,
          unsigned int ItemsPerThread,
          unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
-struct transform_config
+struct transform_config : public detail::transform_config_params
 {
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::transform_config_tag;
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
 
     /// \brief Number of threads in a block.
@@ -540,6 +464,11 @@ struct transform_config
     /// \brief Limit on the number of items for a single kernel launch.
     static constexpr unsigned int size_limit = SizeLimit;
 
+    constexpr transform_config()
+        : detail::transform_config_params{
+            {BlockSize, ItemsPerThread, SizeLimit}
+    }
+    {}
 #endif
 };
 
@@ -559,6 +488,13 @@ template<class Value>
 struct default_transform_config_base : default_transform_config_base_helper<Value>::type
 {};
 
+struct binary_search_config_tag : public transform_config_tag
+{};
+struct upper_bound_config_tag : public transform_config_tag
+{};
+struct lower_bound_config_tag : public transform_config_tag
+{};
+
 } // namespace detail
 
 /// \brief Configuration for the device-level binary search operation.
@@ -569,7 +505,10 @@ template<unsigned int BlockSize,
          unsigned int ItemsPerThread,
          unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
 struct binary_search_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
-{};
+{
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::binary_search_config_tag;
+};
 
 /// \brief Configuration for the device-level upper bound operation.
 /// \tparam BlockSize Number of threads in a block.
@@ -579,7 +518,10 @@ template<unsigned int BlockSize,
          unsigned int ItemsPerThread,
          unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
 struct upper_bound_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
-{};
+{
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::upper_bound_config_tag;
+};
 
 /// \brief Configuration for the device-level lower bound operation.
 /// \tparam BlockSize Number of threads in a block.
@@ -589,11 +531,17 @@ template<unsigned int BlockSize,
          unsigned int ItemsPerThread,
          unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
 struct lower_bound_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
-{};
+{
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::lower_bound_config_tag;
+};
 
 namespace detail
 {
 
+struct histogram_config_tag
+{};
+
 template<class Value, class Output>
 struct default_binary_search_config_base
     : binary_search_config<
@@ -630,6 +578,8 @@ template<class HistogramConfig,
          unsigned int SharedImplHistograms = 3>
 struct histogram_config : detail::histogram_config_params
 {
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::histogram_config_tag;
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
     using histogram = HistogramConfig;
 
@@ -661,6 +611,69 @@ struct default_histogram_config_base
     : default_histogram_config_base_helper<Sample, Channels, ActiveChannels>::type
 {};
 
+struct adjacent_difference_config_tag
+{};
+
+struct adjacent_difference_config_params
+{
+    kernel_config_params          adjacent_difference_kernel_config;
+    ::rocprim::block_load_method  block_load_method;
+    ::rocprim::block_store_method block_store_method;
+};
+} // namespace detail
+
+/// \brief Configuration of device-level adjacent difference primitives.
+///
+/// \tparam BlockSize - number of threads in a block.
+/// \tparam ItemsPerThread - number of items processed by each thread.
+/// \tparam BlockLoadMethod - method for loading input values.
+/// \tparam BlockStoreMethod - method for storing values.
+/// \tparam SizeLimit - limit on the number of items for a single adjacent difference kernel launch.
+template<unsigned int       BlockSize,
+         unsigned int       ItemsPerThread,
+         block_load_method  BlockLoadMethod  = block_load_method::block_load_transpose,
+         block_store_method BlockStoreMethod = block_store_method::block_store_transpose,
+         unsigned int       SizeLimit        = ROCPRIM_GRID_SIZE_LIMIT>
+struct adjacent_difference_config : public detail::adjacent_difference_config_params
+{
+    /// \brief Identifies the algorithm associated with the config.
+    using tag = detail::adjacent_difference_config_tag;
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+    static constexpr ::rocprim::block_load_method  block_load_method  = BlockLoadMethod;
+    static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod;
+    static constexpr unsigned int                  block_size         = BlockSize;
+    static constexpr unsigned int                  items_per_thread   = ItemsPerThread;
+    static constexpr unsigned int                  size_limit         = SizeLimit;
+
+    constexpr adjacent_difference_config()
+        : detail::adjacent_difference_config_params{
+            {BlockSize, ItemsPerThread, SizeLimit},
+            BlockLoadMethod, BlockStoreMethod
+    } {};
+#endif
+};
+
+namespace detail
+{
+
+template<class Value>
+struct default_adjacent_difference_config_base_helper
+{
+    static constexpr unsigned int item_scale
+        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+
+    using type = adjacent_difference_config<
+        limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value,
+        ::rocprim::max(1u, 16u / item_scale),
+        ::rocprim::block_load_method::block_load_transpose,
+        ::rocprim::block_store_method::block_store_transpose>;
+};
+
+template<class Value>
+struct default_adjacent_difference_config_base
+    : default_adjacent_difference_config_base_helper<Value>::type
+{};
+
 } // namespace detail
 
 END_ROCPRIM_NAMESPACE
diff --git a/rocprim/include/rocprim/device/detail/device_histogram.hpp b/rocprim/include/rocprim/device/detail/device_histogram.hpp
index 27453b3ff..6798e2be9 100644
--- a/rocprim/include/rocprim/device/detail/device_histogram.hpp
+++ b/rocprim/include/rocprim/device/detail/device_histogram.hpp
@@ -502,14 +502,12 @@ ROCPRIM_DEVICE ROCPRIM_INLINE void
                     const lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set);
                     same_bin_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
                 }
-                const unsigned int same_bin_count = ::rocprim::bit_count(same_bin_lanes_mask);
-                const unsigned int prev_same_bin_count
-                    = ::rocprim::masked_bit_count(same_bin_lanes_mask);
-                if(prev_same_bin_count == 0)
+                if(::rocprim::group_elect(same_bin_lanes_mask))
                 {
                     // Write the number of lanes having this bin,
                     // if the current lane is the first (and maybe only) lane with this bin.
-                    ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count);
+                    ::rocprim::detail::atomic_add(&histogram[channel][bin],
+                                                  ::rocprim::bit_count(same_bin_lanes_mask));
                 }
             }
         }
diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp
index b28d9c2c2..eeefcaee9 100644
--- a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp
+++ b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp
@@ -318,145 +318,13 @@ struct block_permute_values_impl<Value,
 
 template<typename Key,
          typename Value,
-         unsigned int         BlockSize,
-         unsigned int         ItemsPerThread,
-         block_sort_algorithm Algo,
+         unsigned int BlockSize,
+         unsigned int ItemsPerThread,
          typename Enable = void>
-struct block_sort_impl
-{
-    using stable_key_type = rocprim::tuple<Key, unsigned int>;
-
-    using keys_load_type
-        = block_load<Key, BlockSize, ItemsPerThread, block_load_method::block_load_transpose>;
-
-    using sort_type
-        = block_sort<stable_key_type, BlockSize, ItemsPerThread, rocprim::empty_type, Algo>;
-
-    using keys_store_type
-        = block_store<Key, BlockSize, ItemsPerThread, block_store_method::block_store_transpose>;
-
-    using values_permute_type = block_permute_values_impl<Value, BlockSize, ItemsPerThread>;
-
-    union storage_type
-    {
-        typename keys_load_type::storage_type      load_keys;
-        typename sort_type::storage_type           sort;
-        typename keys_store_type::storage_type     store_keys;
-        typename values_permute_type::storage_type permute_values;
-    };
-
-    template<typename KeysInputIterator,
-             typename KeysOutputIterator,
-             typename ValuesInputIterator,
-             typename ValuesOutputIterator,
-             typename BinaryFunction>
-    ROCPRIM_DEVICE  ROCPRIM_FORCE_INLINE
-    void sort(const unsigned int   valid_in_last_block,
-              const bool           is_incomplete_block,
-              KeysInputIterator    keys_input,
-              KeysOutputIterator   keys_output,
-              ValuesInputIterator  values_input,
-              ValuesOutputIterator values_output,
-              BinaryFunction       compare_function,
-              storage_type&        storage)
-    {
-        // By default, the block sort algorithm is not stable. We can make it stable
-        // by adding an index to each key.
-
-        Key keys[ItemsPerThread];
-
-        if(is_incomplete_block)
-        {
-            keys_load_type().load(keys_input, keys, valid_in_last_block, storage.load_keys);
-        }
-        else
-        {
-            keys_load_type().load(keys_input, keys, storage.load_keys);
-        }
-
-        const auto flat_id = block_thread_id<0>();
-
-        stable_key_type stable_keys[ItemsPerThread];
-        ROCPRIM_UNROLL
-        for(unsigned int i = 0; i < ItemsPerThread; ++i)
-        {
-            stable_keys[i] = rocprim::make_tuple(keys[i], flat_id * ItemsPerThread + i);
-        }
-
-        syncthreads();
-
-        // Special compare function that enforces sorting is stable.
-        auto stable_compare_function
-            = [compare_function](const stable_key_type& a,
-                                 const stable_key_type& b) ROCPRIM_FORCE_INLINE mutable
-        {
-            const bool ab = compare_function(rocprim::get<0>(a), rocprim::get<0>(b));
-            return ab
-                   || (!compare_function(rocprim::get<0>(b), rocprim::get<0>(a))
-                       && (rocprim::get<1>(a) < rocprim::get<1>(b)));
-        };
-
-        if(is_incomplete_block)
-        {
-            // Special compare function that enforces sorting is stable, and that out-of-bounds elements
-            // are not compared.
-            auto stable_oob_compare_function
-                = [stable_compare_function, valid_in_last_block](const stable_key_type& a,
-                                                                 const stable_key_type& b) mutable
-            {
-                const bool a_oob = rocprim::get<1>(a) >= valid_in_last_block;
-                const bool b_oob = rocprim::get<1>(b) >= valid_in_last_block;
-                return a_oob || b_oob ? !a_oob : stable_compare_function(a, b);
-            };
-
-            // Note: rocprim::block_sort with an algorithm that is not stable_merge_sort does not implement sorting
-            // a misaligned amount of items.
-            sort_type().sort(stable_keys, storage.sort, stable_oob_compare_function);
-
-            unsigned int ranks[ItemsPerThread];
-            ROCPRIM_UNROLL
-            for(unsigned int i = 0; i < ItemsPerThread; ++i)
-            {
-                keys[i]  = rocprim::get<0>(stable_keys[i]);
-                ranks[i] = rocprim::get<1>(stable_keys[i]);
-            }
-
-            syncthreads();
-            keys_store_type().store(keys_output, keys, valid_in_last_block, storage.store_keys);
-            values_permute_type().permute(ranks,
-                                          values_input,
-                                          values_output,
-                                          valid_in_last_block,
-                                          storage.permute_values);
-        }
-        else
-        {
-            sort_type().sort(stable_keys, storage.sort, stable_compare_function);
-
-            unsigned int ranks[ItemsPerThread];
-            ROCPRIM_UNROLL
-            for(unsigned int i = 0; i < ItemsPerThread; ++i)
-            {
-                keys[i]  = rocprim::get<0>(stable_keys[i]);
-                ranks[i] = rocprim::get<1>(stable_keys[i]);
-            }
-
-            syncthreads();
-            keys_store_type().store(keys_output, keys, storage.store_keys);
-            values_permute_type().permute(ranks,
-                                          values_input,
-                                          values_output,
-                                          storage.permute_values);
-        }
-    }
-};
+struct block_sort_impl;
 
 template<typename Key, unsigned int BlockSize, unsigned int ItemsPerThread>
-struct block_sort_impl<Key,
-                       rocprim::empty_type,
-                       BlockSize,
-                       ItemsPerThread,
-                       block_sort_algorithm::stable_merge_sort>
+struct block_sort_impl<Key, rocprim::empty_type, BlockSize, ItemsPerThread>
 {
     using keys_load_type
         = block_load<Key, BlockSize, ItemsPerThread, block_load_method::block_load_transpose>;
@@ -518,7 +386,6 @@ struct block_sort_impl<Key,
                        Value,
                        BlockSize,
                        ItemsPerThread,
-                       block_sort_algorithm::stable_merge_sort,
                        std::enable_if_t<(sizeof(Value) <= sizeof(int))>>
 {
     using keys_load_type
@@ -599,7 +466,6 @@ struct block_sort_impl<Key,
                        Value,
                        BlockSize,
                        ItemsPerThread,
-                       block_sort_algorithm::stable_merge_sort,
                        std::enable_if_t<(sizeof(Value) > sizeof(int))>>
 {
     using keys_load_type
@@ -677,9 +543,8 @@ struct block_sort_impl<Key,
 };
 #endif // DOXYGEN_SHOULD_SKIP_THIS
 
-template<unsigned int         BlockSize,
-         unsigned int         ItemsPerThread,
-         block_sort_algorithm Algo,
+template<unsigned int BlockSize,
+         unsigned int ItemsPerThread,
          class KeysInputIterator,
          class KeysOutputIterator,
          class ValuesInputIterator,
@@ -704,7 +569,7 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto block_sort_kernel_impl(KeysInputIterato
     const unsigned int valid_in_last_block = input_size - block_offset;
     const bool         is_incomplete_block = flat_block_id == (input_size / items_per_block);
 
-    using sort_impl = block_sort_impl<key_type, value_type, BlockSize, ItemsPerThread, Algo>;
+    using sort_impl = block_sort_impl<key_type, value_type, BlockSize, ItemsPerThread>;
 
     ROCPRIM_SHARED_MEMORY typename sort_impl::storage_type storage;
 
diff --git a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp
index 2d8800876..bbcff597b 100644
--- a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp
+++ b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -163,20 +163,16 @@ struct radix_digit_count_helper
                 const bit_key_type bit_key = key_codec::encode(keys[i]);
                 const unsigned int digit = key_codec::extract_digit(bit_key, bit, current_radix_bits);
                 const unsigned int pos = i * BlockSize + flat_id;
-                lane_mask_type same_digit_lanes_mask = ::rocprim::ballot(IsFull || (pos < valid_count));
-                for(unsigned int b = 0; b < RadixBits; b++)
-                {
-                    const unsigned int bit_set = digit & (1u << b);
-                    const lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set);
-                    same_digit_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask);
-                }
-                const unsigned int same_digit_count = ::rocprim::bit_count(same_digit_lanes_mask);
-                const unsigned int prev_same_digit_count = ::rocprim::masked_bit_count(same_digit_lanes_mask);
-                if(prev_same_digit_count == 0)
+
+                lane_mask_type same_digit_lanes_mask
+                    = ::rocprim::match_any<RadixBits>(digit, IsFull || (pos < valid_count));
+
+                if(::rocprim::group_elect(same_digit_lanes_mask))
                 {
                     // Write the number of lanes having this digit,
                     // if the current lane is the first (and maybe only) lane with this digit.
-                    storage.digit_counts[warp_id][digit] += same_digit_count;
+                    storage.digit_counts[warp_id][digit]
+                        += ::rocprim::bit_count(same_digit_lanes_mask);
                 }
             }
         }
@@ -1194,17 +1190,18 @@ template<unsigned int               BlockSize,
          class ValuesInputIterator,
          class ValuesOutputIterator,
          class Offset>
-ROCPRIM_DEVICE void onesweep_iteration(KeysInputIterator        keys_input,
-                                       KeysOutputIterator       keys_output,
-                                       ValuesInputIterator      values_input,
-                                       ValuesOutputIterator     values_output,
-                                       const unsigned int       size,
-                                       Offset*                  global_digit_offsets_in,
-                                       Offset*                  global_digit_offsets_out,
-                                       onesweep_lookback_state* lookback_states,
-                                       const unsigned int       bit,
-                                       const unsigned int       current_radix_bits,
-                                       const unsigned int       full_blocks)
+ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
+    onesweep_iteration(KeysInputIterator        keys_input,
+                       KeysOutputIterator       keys_output,
+                       ValuesInputIterator      values_input,
+                       ValuesOutputIterator     values_output,
+                       const unsigned int       size,
+                       Offset*                  global_digit_offsets_in,
+                       Offset*                  global_digit_offsets_out,
+                       onesweep_lookback_state* lookback_states,
+                       const unsigned int       bit,
+                       const unsigned int       current_radix_bits,
+                       const unsigned int       full_blocks)
 {
     using key_type   = typename std::iterator_traits<KeysInputIterator>::value_type;
     using value_type = typename std::iterator_traits<ValuesInputIterator>::value_type;
diff --git a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp
index 3fd9f8a65..16bc5b866 100644
--- a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp
+++ b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +31,7 @@
 #include "../../detail/match_result_type.hpp"
 #include "../../detail/various.hpp"
 #include "../../intrinsics/thread.hpp"
+#include "../../thread/thread_operators.hpp"
 
 #include "../../config.hpp"
 
@@ -59,6 +60,36 @@ template<typename AccumulatorType, bool UseSleep = false>
 using lookback_scan_state_t
     = detail::lookback_scan_state<wrapped_type_t<AccumulatorType>, UseSleep>;
 
+template<typename EqualityOp>
+struct guarded_inequality_wrapper
+{
+    /// Wrapped equality operator
+    EqualityOp op;
+
+    /// Out-of-bounds limit: comparisons whose index is not below this value yield \p false
+    size_t guard;
+
+    /// Constructor. Stores the equality operator \p op and the out-of-bounds limit \p guard.
+    ROCPRIM_HOST_DEVICE inline guarded_inequality_wrapper(EqualityOp op, size_t guard)
+        : op(op), guard(guard)
+    {}
+
+    /// \brief Guarded boolean inequality operator.
+    ///
+    /// \tparam T Type of the operands compared by the equality operator
+    /// \param a Left hand-side operand
+    /// \param b Right hand-side operand
+    /// \param idx Index supplied by the caller of this operator. It is compared against
+    /// \p guard to determine which operations are out-of-bounds
+    /// \returns <tt>!op(a, b)</tt> when \p idx is below \p guard (in-bounds), \p false otherwise.
+    template<typename T>
+    ROCPRIM_HOST_DEVICE inline bool operator()(const T& a, const T& b, size_t idx) const
+    {
+        // In-bounds return operation result, out-of-bounds return false.
+        return (idx < guard) ? !op(a, b) : false;
+    }
+};
+
 template<typename KeyType,
          typename AccumulatorType,
          unsigned int      BlockSize,
@@ -93,14 +124,8 @@ struct load_helper
         }
         else
         {
-            // Pad with the last valid value so out-of-bound items are not flagged
-            block_load_keys{}.load(tile_keys,
-                                   keys,
-                                   valid_in_global_last_tile,
-                                   tile_keys[valid_in_global_last_tile - 1],
-                                   storage.keys);
+            block_load_keys{}.load(tile_keys, keys, valid_in_global_last_tile, storage.keys);
             ::rocprim::syncthreads();
-
             block_load_values{}.load(tile_values,
                                      values,
                                      valid_in_global_last_tile,
@@ -121,22 +146,47 @@ struct discontinuity_helper
                                    CompareFunction compare,
                                    unsigned int (&head_flags)[ItemsPerThread],
                                    const bool    is_global_first_tile,
+                                   const bool    is_global_last_tile,
+                                   const size_t  remaining,
                                    storage_type& storage)
     {
-        auto not_equal = [compare](const auto& a, const auto& b) mutable { return !compare(a, b); };
-
-        if(!is_global_first_tile)
+        if(is_global_last_tile)
         {
-            const KeyType tile_predecessor = tile_keys[-1];
-            block_discontinuity_type{}.flag_heads(head_flags,
-                                                  tile_predecessor,
-                                                  keys,
-                                                  not_equal,
-                                                  storage);
+            // If it's the last tile globally, the out-of-bound items should not be flagged.
+            auto guarded_not_equal
+                = guarded_inequality_wrapper<CompareFunction>(compare, remaining);
+
+            if(!is_global_first_tile)
+            {
+                const KeyType tile_predecessor = tile_keys[-1];
+                block_discontinuity_type{}.flag_heads(head_flags,
+                                                      tile_predecessor,
+                                                      keys,
+                                                      guarded_not_equal,
+                                                      storage);
+            }
+            else
+            {
+                block_discontinuity_type{}.flag_heads(head_flags, keys, guarded_not_equal, storage);
+            }
         }
         else
         {
-            block_discontinuity_type{}.flag_heads(head_flags, keys, not_equal, storage);
+            auto not_equal = rocprim::inequality_wrapper<CompareFunction>(compare);
+
+            if(!is_global_first_tile)
+            {
+                const KeyType tile_predecessor = tile_keys[-1];
+                block_discontinuity_type{}.flag_heads(head_flags,
+                                                      tile_predecessor,
+                                                      keys,
+                                                      not_equal,
+                                                      storage);
+            }
+            else
+            {
+                block_discontinuity_type{}.flag_heads(head_flags, keys, not_equal, storage);
+            }
         }
     }
 };
@@ -270,8 +320,11 @@ class tile_helper
         // first tile in this launch
         const bool is_first_tile = tile_id == 0;
 
+        // When in last tile valid_in_global_last_tile = remaining
         const unsigned int valid_in_global_last_tile
             = static_cast<unsigned int>(size - ((total_number_of_tiles - 1) * items_per_tile));
+        const size_t remaining
+            = static_cast<size_t>(size - (size_t{global_tile_id} * items_per_tile));
 
         const unsigned int flat_thread_id = threadIdx.x;
 
@@ -293,6 +346,8 @@ class tile_helper
                                         compare,
                                         head_flags,
                                         is_global_first_tile,
+                                        is_global_last_tile,
+                                        remaining,
                                         storage.scan.flags);
 
         wrapped_type wrapped_values[ItemsPerThread];
diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp
index f01f7347a..9287e5b21 100644
--- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp
+++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -35,6 +35,8 @@
 #include "../../detail/temp_storage.hpp"
 #include "../../detail/various.hpp"
 
+#include "../config_types.hpp"
+
 extern "C"
 {
     void __builtin_amdgcn_s_sleep(int);
@@ -98,26 +100,37 @@ struct lookback_scan_state<T, UseSleep, true>
     using value_type = T;
 
     // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes
-    ROCPRIM_HOST static inline
-    lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t create(lookback_scan_state& state,
+                                                 void*                temp_storage,
+                                                 const unsigned int   number_of_blocks,
+                                                 const hipStream_t /*stream*/)
     {
-        (void) number_of_blocks;
-        lookback_scan_state state;
+        (void)number_of_blocks; // not needed by this specialization
         state.prefixes = reinterpret_cast<prefix_underlying_type*>(temp_storage);
-        return state;
+        return hipSuccess; // only a pointer is stored above, so there is no failure path
     }
 
-    ROCPRIM_HOST static inline
-    size_t get_storage_size(const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks,
+                                                           const hipStream_t  stream,
+                                                           size_t&            storage_size)
     {
-        return sizeof(prefix_underlying_type) * (::rocprim::host_warp_size() + number_of_blocks);
+        // Zero-initialized: the size below is computed even if host_warp_size() fails.
+        unsigned int warp_size = 0;
+        hipError_t   error     = ::rocprim::host_warp_size(stream, warp_size);
+
+        storage_size = sizeof(prefix_underlying_type) * (warp_size + number_of_blocks);
+        return error;
     }
 
-    ROCPRIM_HOST static inline detail::temp_storage::layout
-        get_temp_storage_layout(const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t
+        get_temp_storage_layout(const unsigned int            number_of_blocks,
+                                const hipStream_t             stream,
+                                detail::temp_storage::layout& layout) // out: size + alignment
     {
-        return detail::temp_storage::layout{get_storage_size(number_of_blocks),
-                                            alignof(prefix_underlying_type)};
+        size_t     storage_size = 0; // filled in by get_storage_size() below
+        hipError_t error        = get_storage_size(number_of_blocks, stream, storage_size);
+        layout = detail::temp_storage::layout{storage_size, alignof(prefix_underlying_type)};
+        return error; // layout is assigned regardless of the status returned here
     }
 
     ROCPRIM_DEVICE ROCPRIM_INLINE
@@ -238,11 +251,15 @@ struct lookback_scan_state<T, UseSleep, false>
     using value_type = T;
 
     // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes
-    ROCPRIM_HOST static inline
-    lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t create(lookback_scan_state& state,
+                                                 void*                temp_storage,
+                                                 const unsigned int   number_of_blocks,
+                                                 const hipStream_t    stream)
     {
-        const auto n = ::rocprim::host_warp_size() + number_of_blocks;
-        lookback_scan_state state;
+        // Zero-initialized so that n is well defined even if host_warp_size() fails.
+        unsigned int warp_size = 0;
+        hipError_t   error     = ::rocprim::host_warp_size(stream, warp_size);
+        const auto n = warp_size + number_of_blocks;
 
         auto ptr = static_cast<char*>(temp_storage);
 
@@ -253,23 +270,31 @@ struct lookback_scan_state<T, UseSleep, false>
         ptr += ::rocprim::detail::align_size(n * sizeof(T));
 
         state.prefixes_complete_values = reinterpret_cast<T*>(ptr);
-        return state;
+        return error;
     }
 
-    ROCPRIM_HOST static inline
-    size_t get_storage_size(const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks,
+                                                           const hipStream_t  stream,
+                                                           size_t&            storage_size)
     {
-        const auto n = ::rocprim::host_warp_size() + number_of_blocks;
-        size_t size = ::rocprim::detail::align_size(n * sizeof(flag_type));
-        size += 2 * ::rocprim::detail::align_size(n * sizeof(T));
-        return size;
+        unsigned int warp_size = 0; // defined value even if the query below fails
+        hipError_t   error     = ::rocprim::host_warp_size(stream, warp_size);
+        const auto   n         = warp_size + number_of_blocks;
+        storage_size           = ::rocprim::detail::align_size(n * sizeof(flag_type));
+        storage_size += 2 * ::rocprim::detail::align_size(n * sizeof(T));
+        return error;
     }
 
-    ROCPRIM_HOST static inline detail::temp_storage::layout
-        get_temp_storage_layout(const unsigned int number_of_blocks)
+    ROCPRIM_HOST static inline hipError_t
+        get_temp_storage_layout(const unsigned int            number_of_blocks,
+                                const hipStream_t             stream,
+                                detail::temp_storage::layout& layout) // out: size + alignment
     {
+        size_t     storage_size = 0; // filled in by get_storage_size() below
         size_t alignment = std::max(alignof(flag_type), alignof(T));
-        return detail::temp_storage::layout{get_storage_size(number_of_blocks), alignment};
+        hipError_t error        = get_storage_size(number_of_blocks, stream, storage_size);
+        layout                  = detail::temp_storage::layout{storage_size, alignment};
+        return error; // layout is assigned regardless of the status returned here
     }
 
     ROCPRIM_DEVICE ROCPRIM_INLINE
diff --git a/rocprim/include/rocprim/device/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/device_adjacent_difference.hpp
index 09c418504..917bf4328 100644
--- a/rocprim/include/rocprim/device/device_adjacent_difference.hpp
+++ b/rocprim/include/rocprim/device/device_adjacent_difference.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -71,22 +71,28 @@ BEGIN_ROCPRIM_NAMESPACE
 
 namespace detail
 {
-template <typename Config,
-          bool InPlace,
-          bool Right,
-          typename InputIt,
-          typename OutputIt,
-          typename BinaryFunction>
-void ROCPRIM_KERNEL __launch_bounds__(Config::block_size) adjacent_difference_kernel(
-    const InputIt                                             input,
-    const OutputIt                                            output,
-    const std::size_t                                         size,
-    const BinaryFunction                                      op,
-    const typename std::iterator_traits<InputIt>::value_type* previous_values,
-    const std::size_t                                         starting_block)
+template<typename Config, // supplies the block size passed to __launch_bounds__ below
+         bool InPlace,
+         bool Right,
+         typename InputIt,
+         typename OutputIt,
+         typename BinaryFunction>
+void ROCPRIM_KERNEL // entry point; forwards everything to adjacent_difference_kernel_impl
+    __launch_bounds__(device_params<Config>().adjacent_difference_kernel_config.block_size)
+        adjacent_difference_kernel(
+            const InputIt                                             input,
+            const OutputIt                                            output,
+            const std::size_t                                         size,
+            const BinaryFunction                                      op,
+            const typename std::iterator_traits<InputIt>::value_type* previous_values,
+            const std::size_t                                         starting_block)
 {
-    adjacent_difference_kernel_impl<Config, InPlace, Right>(
-        input, output, size, op, previous_values, starting_block);
+    adjacent_difference_kernel_impl<Config, InPlace, Right>(input,
+                                                            output,
+                                                            size,
+                                                            op,
+                                                            previous_values,
+                                                            starting_block);
 }
 
 template <typename Config,
@@ -106,16 +112,23 @@ hipError_t adjacent_difference_impl(void* const          temporary_storage,
 {
     using value_type = typename std::iterator_traits<InputIt>::value_type;
 
-    using config = detail::default_or_custom_config<
-        Config,
-        detail::default_adjacent_difference_config<ROCPRIM_TARGET_ARCH, value_type>>;
+    using config = wrapped_adjacent_difference_config<Config, InPlace, value_type>;
+
+    detail::target_arch target_arch;
+    hipError_t          result = detail::host_target_arch(stream, target_arch);
+    if(result != hipSuccess)
+    {
+        return result;
+    }
 
-    static constexpr unsigned int block_size       = config::block_size;
-    static constexpr unsigned int items_per_thread = config::items_per_thread;
-    static constexpr unsigned int items_per_block  = block_size * items_per_thread;
+    const detail::adjacent_difference_config_params params
+        = detail::dispatch_target_arch<config>(target_arch);
 
-    const std::size_t num_blocks = ceiling_div(size, items_per_block);
-    const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0;
+    const unsigned int block_size       = params.adjacent_difference_kernel_config.block_size;
+    const unsigned int items_per_thread = params.adjacent_difference_kernel_config.items_per_thread;
+    const unsigned int items_per_block  = block_size * items_per_thread;
+    const std::size_t  num_blocks       = ceiling_div(size, items_per_block);
+    const std::size_t  num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0;
 
     value_type* previous_values;
 
@@ -139,11 +152,11 @@ hipError_t adjacent_difference_impl(void* const          temporary_storage,
     {
         // If doing left adjacent diff then the last item of each block is needed for the
         // next block, otherwise the first item is needed for the previous block
-        static constexpr auto offset = items_per_block - (Right ? 0 : 1);
+        const auto offset = items_per_block - (Right ? 0 : 1);
 
         const auto block_starts_iter = make_transform_iterator(
-            rocprim::make_counting_iterator(std::size_t {0}),
-            [base = input + offset](std::size_t i) { return base[i * items_per_block]; });
+            rocprim::make_counting_iterator(std::size_t{0}),
+            [=, base = input + offset](std::size_t i) { return base[i * items_per_block]; });
 
         const hipError_t error = ::rocprim::transform(block_starts_iter,
                                                       previous_values,
@@ -157,9 +170,9 @@ hipError_t adjacent_difference_impl(void* const          temporary_storage,
         }
     }
 
-    static constexpr unsigned int size_limit     = config::size_limit;
-    static constexpr auto number_of_blocks_limit = std::max(size_limit / items_per_block, 1u);
-    static constexpr auto aligned_size_limit     = number_of_blocks_limit * items_per_block;
+    const unsigned int size_limit             = params.adjacent_difference_kernel_config.size_limit;
+    const auto         number_of_blocks_limit = std::max(size_limit / items_per_block, 1u);
+    const auto         aligned_size_limit     = number_of_blocks_limit * items_per_block;
 
     // Launch number_of_blocks_limit blocks while there is still at least as many blocks
     // left as the limit
@@ -210,7 +223,7 @@ hipError_t adjacent_difference_impl(void* const          temporary_storage,
 }
 } // namespace detail
 
-#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
+    #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
 
 #endif // DOXYGEN_SHOULD_SKIP_THIS
 
@@ -231,8 +244,8 @@ hipError_t adjacent_difference_impl(void* const          temporary_storage,
 /// }
 /// \endcode
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// `adjacent_difference_config` or a class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// `adjacent_difference_config` or a class derived from it.
 /// \tparam InputIt - [inferred] random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIt - [inferred] random-access iterator type of the output range. Must meet the
@@ -327,8 +340,8 @@ hipError_t adjacent_difference(void* const          temporary_storage,
 /// }
 /// \endcode
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// `adjacent_difference_config` or a class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// `adjacent_difference_config` or a class derived from it.
 /// \tparam InputIt - [inferred] random-access iterator type of the value range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam BinaryFunction - [inferred] binary operation function object that will be applied to
@@ -380,8 +393,8 @@ hipError_t adjacent_difference_inplace(void* const          temporary_storage,
 /// }
 /// \endcode
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// `adjacent_difference_config` or a class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// `adjacent_difference_config` or a class derived from it.
 /// \tparam InputIt - [inferred] random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIt - [inferred] random-access iterator type of the output range. Must meet the
@@ -476,8 +489,8 @@ hipError_t adjacent_difference_right(void* const          temporary_storage,
 /// }
 /// \endcode
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// `adjacent_difference_config` or a class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// `adjacent_difference_config` or a class derived from it.
 /// \tparam InputIt - [inferred] random-access iterator type of the value range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam BinaryFunction - [inferred] binary operation function object that will be applied to
diff --git a/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp b/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp
index 804e7d20b..0299484f1 100644
--- a/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp
+++ b/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -28,6 +28,9 @@
 #include "../functional.hpp"
 
 #include "config_types.hpp"
+#include "detail/config/device_adjacent_difference.hpp"
+#include "detail/config/device_adjacent_difference_inplace.hpp"
+#include "detail/device_config_helper.hpp"
 
 #include "../block/block_load.hpp"
 #include "../block/block_store.hpp"
@@ -37,44 +40,62 @@
 
 BEGIN_ROCPRIM_NAMESPACE
 
-/// \brief Configuration of device-level adjacent_difference primitives.
-///
-/// \tparam BlockSize - number of threads in a block.
-/// \tparam ItemsPerThread - number of items processed by each thread
-/// \tparam LoadMethod - method for loading input values
-/// \tparam StoreMethod - method for storing values
-/// \tparam SizeLimit - limit on the number of items for a single adjacent_difference kernel launch.
-/// Larger input sizes will be broken up to multiple kernel launches.
-template <unsigned int       BlockSize,
-          unsigned int       ItemsPerThread,
-          block_load_method  LoadMethod  = block_load_method::block_load_transpose,
-          block_store_method StoreMethod = block_store_method::block_store_transpose,
-          unsigned int       SizeLimit   = ROCPRIM_GRID_SIZE_LIMIT>
-struct adjacent_difference_config : kernel_config<BlockSize, ItemsPerThread, SizeLimit>
-{
-    static constexpr block_load_method  load_method  = LoadMethod; ///< input values are loaded using this method
-    static constexpr block_store_method store_method = StoreMethod; ///< input values are stored using this method
-};
-
 namespace detail
 {
 
-template <class Value>
-struct adjacent_difference_config_fallback
+// Primary template: wraps a user-provided configuration
+template<typename AdjacentDifferenceConfig, bool InPlace, typename>
+struct wrapped_adjacent_difference_config
 {
-    static constexpr unsigned int item_scale
-        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
+    static_assert(
+        std::is_same<typename AdjacentDifferenceConfig::tag, adjacent_difference_config_tag>::value,
+        "Config must be a specialization of struct template adjacent_difference_config");
+
+    template<target_arch Arch>
+    struct architecture_config
+    {
+        static constexpr adjacent_difference_config_params params = AdjacentDifferenceConfig{};
+    };
+};
 
-    using type = adjacent_difference_config<256, ::rocprim::max(1u, 16u / item_scale)>;
+// Specialization for selecting the default configuration for in place
+template<typename Value>
+struct wrapped_adjacent_difference_config<default_config, true, Value>
+{
+    template<target_arch Arch>
+    struct architecture_config
+    {
+        static constexpr adjacent_difference_config_params params
+            = default_adjacent_difference_inplace_config<static_cast<unsigned int>(Arch), Value>{};
+    };
 };
 
-template <unsigned int TargetArch, class Value>
-struct default_adjacent_difference_config
-    : select_arch<TargetArch, adjacent_difference_config_fallback<Value>>
+// Specialization for selecting the default configuration for out of place
+template<typename Value>
+struct wrapped_adjacent_difference_config<default_config, false, Value>
 {
+    template<target_arch Arch>
+    struct architecture_config
+    {
+        static constexpr adjacent_difference_config_params params
+            = default_adjacent_difference_config<static_cast<unsigned int>(Arch), Value>{};
+    };
 };
 
-} // end namespace detail
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+template<class Value>
+template<target_arch Arch>
+constexpr adjacent_difference_config_params
+    wrapped_adjacent_difference_config<rocprim::default_config, true, Value>::architecture_config<
+        Arch>::params;
+template<class Value>
+template<target_arch Arch>
+constexpr adjacent_difference_config_params
+    wrapped_adjacent_difference_config<rocprim::default_config, false, Value>::architecture_config<
+        Arch>::params;
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+} // namespace detail
 
 END_ROCPRIM_NAMESPACE
 
diff --git a/rocprim/include/rocprim/device/device_binary_search.hpp b/rocprim/include/rocprim/device/device_binary_search.hpp
index 2f6a0146d..395f5ab19 100644
--- a/rocprim/include/rocprim/device/device_binary_search.hpp
+++ b/rocprim/include/rocprim/device/device_binary_search.hpp
@@ -31,11 +31,11 @@
 #include "device_binary_search_config.hpp"
 #include "device_transform.hpp"
 
-BEGIN_ROCPRIM_NAMESPACE
-
 /// \addtogroup devicemodule
 /// @{
 
+BEGIN_ROCPRIM_NAMESPACE
+
 namespace detail
 {
 
@@ -83,45 +83,117 @@ hipError_t binary_search(void * temporary_storage,
     );
 }
 
+template<class Config, class Tag>
+struct is_default_or_has_tag
+{
+    static constexpr bool value
+        = std::integral_constant<bool, std::is_same<typename Config::tag, Tag>::value>::value;
+};
+
+template<class Tag>
+struct is_default_or_has_tag<default_config, Tag>
+{
+    static constexpr bool value = true;
+};
+
 } // end of detail namespace
 
-/// \brief Performs a device-level lower bound check.
+/// \brief Parallel primitive that uses binary search for computing a lower bound on a given ordered
+/// range for each element of a given input.
+///
+/// The `lower_bound` function determines for each element `e` of a given input the lowest index
+/// `i` in a given ordered range `haystack` such that `!compare_op(haystack[i], e)` is
+/// `true`, or `haystack_size` if no such index exists.
+/// It uses the search function `detail::lower_bound_search_op`, which in turn uses a binary
+/// operator `compare_op` for comparing the given value with the haystack ones.
 ///
 /// \par Overview
-/// Runs multiple lower bound checks in parallel (one for each \p needle in <tt>needles</tt>).
-/// A lower bound check returns the index of the first element in \p haystack that
-/// causes \p compare_op(element,needle) to return false. If no item in \p haystack satisfies
-/// this criteria, then \p haystack_size is returned.
-/// Results are written by \p output.
+/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes)
+/// is written to `storage_size` and the function returns without performing the search operation.
+/// * If used along with `rocprim::upper_bound`, the ith element of the given input must be located
+/// in the semi-open interval `[lower_output[i], upper_output[i])` of `haystack`, in case of
+/// being present at all.
 ///
-/// \tparam Config - [optional] configuration information for the primitive. This can be 
-/// \p lower_bound_config or a custom class with the same members.
-/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values).
-/// \tparam NeedlesIterator - Iterator type for items we are performing lower bound checks
-/// for (keys).
-/// \tparam OutputIterator - Iterator type for the output indices.
-/// \tparam CompareFunction [optional] A callable that can be used to compare two values.
-/// defaults to rocprim::less.
+///  \tparam Config - [optional] Configuration of the primitive. It has to be `lower_bound_config` or
+/// a class derived from it. Default is `default_config`.
+///  \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///  \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. Elements of
+/// the type pointed by it must be comparable to elements of the type pointed by `HaystackIterator`
+/// as either operand of `compare_op`.
+///  \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet
+/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///  \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the
+/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible
+/// to bool. Default type is `::rocprim::less<>`.
+/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage.
+/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`.
+/// \param [in] haystack - Iterator to the first element in the search range. Elements of this
+/// range must be sorted.
+/// \param [in] needles - Iterator to the first element in the range of values to search for on
+/// `haystack`.
+/// \param [out] output - Iterator to the first element in the output range.
+/// \param [in] haystack_size - Number of elements in the search range `haystack`.
+/// \param [in] needles_size - Number of elements in the input range `needles`.
+/// \param [in] compare_op - Binary operation function object that is used to compare values. The
+/// signature of the function should be equivalent to the following:
+/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the
+/// function object must not modify the objects passed to it. Default is `CompareFunction()`.
+/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is
+/// forced in order to check for errors.
+/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of
+/// type `hipError_t`.
 ///
-/// \param [in] temporary_storage - pointer to device-accessible temporary storage. When
-/// a null pointer is passed, the required allocation size (in bytes) is written to
-/// \p storage_size and the function returns without performing the search operation.
-/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage.
-/// \param haystack [in] - iterator pointing to the beginning of the range to search through.
-/// \param needles [in] - iterator pointing to the first of the elements to perform lower
-/// bound checks on.
-/// \param output [out] - Iterator pointing to the beginning of the range where the results
-/// are to be stored.
-/// \param haystack_size [in] - the total number of values to search through.
-/// \param needles_size [in] - the total number of keys to perform lower bound checks for.
-/// \param compare_op [in] - binary operation function that will be used for comparison.
-/// The signature of the function should be equivalent to the following:
-/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
-/// <tt>const &</tt>, but the function object must not modify the objects passed to it.
-/// The default value is \p CompareFunction().
-/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
-/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
-/// launch is forced in order to check for errors. Default value is \p false.
+/// \par Example
+/// \parblock
+/// In this example a device-level lower bound computation on a haystack of double precision type
+/// values is performed on an input array of integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.).
+/// size_t          haystack_size;    // e.g. 7
+/// double *        haystack;         // e.g. {0, 1.5, 3, 4.5, 6, 7.5, 9}
+/// size_t          needles_size;     // e.g. 5
+/// int *           needles;          // e.g. {1, 2, 3, 4, 5}
+/// compare_op_type compare_op;       // e.g. compare_op_type = rocprim::less<>
+/// size_t *        output;           // empty array of needles_size elements
+///
+/// // Get required size of the temporary storage.
+/// void * temporary_storage = nullptr;
+/// size_t temporary_storage_bytes;
+/// rocprim::lower_bound<config>(temporary_storage,
+///                              temporary_storage_bytes,
+///                              haystack,
+///                              needles,
+///                              output,
+///                              haystack_size,
+///                              needles_size,
+///                              compare_op,
+///                              stream,
+///                              debug_synchronous);
+///
+/// // Allocate temporary storage.
+/// hipMalloc(&temporary_storage, temporary_storage_bytes);
+///
+/// // Perform binary search.
+/// rocprim::lower_bound<config>(temporary_storage,
+///                              temporary_storage_bytes,
+///                              haystack,
+///                              needles,
+///                              output,
+///                              haystack_size,
+///                              needles_size,
+///                              compare_op,
+///                              stream,
+///                              debug_synchronous);
+///
+/// // output = {1, 2, 2, 3, 4}
+/// \endcode
+/// \endparblock
 template<
     class Config = default_config,
     class HaystackIterator,
@@ -141,6 +213,9 @@ hipError_t lower_bound(void * temporary_storage,
                        hipStream_t stream = 0,
                        bool debug_synchronous = false)
 {
+    static_assert(detail::is_default_or_has_tag<Config, detail::lower_bound_config_tag>::value,
+                  "Config must be a specialization of struct template lower_bound_config");
+
     using value_type  = typename std::iterator_traits<NeedlesIterator>::value_type;
     using output_type = typename std::iterator_traits<OutputIterator>::value_type;
     using config
@@ -161,43 +236,102 @@ hipError_t lower_bound(void * temporary_storage,
                                          debug_synchronous);
 }
 
-/// \brief Performs a device-level upper bound check.
+/// \brief Parallel primitive that uses binary search for computing an upper bound on a given ordered
+/// range for each element of a given input.
+///
+/// The `upper_bound` function determines for each element `e` of a given input the lowest index
+/// `i` in a given ordered range `haystack` such that `compare_op(e, haystack[i])` is
+/// `true`, or `haystack_size` if no such index exists.
+/// It uses the search function `detail::upper_bound_search_op`, which in turn uses a binary
+/// operator `compare_op` for comparing the input values with the haystack ones.
 ///
 /// \par Overview
-/// Runs multiple upper bound checks in parallel (one for each \p needle in <tt>needles</tt>).
-/// An upper bound check returns the index of the first element in \p haystack that
-/// causes \p compare_op(needle,element) to return true. If no item in \p haystack satisfies
-/// this criteria, then \p haystack_size is returned.
-/// Results are written by \p output.
+/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes)
+/// is written to `storage_size` and the function returns without performing the search operation.
+/// * If used along with `rocprim::lower_bound`, the ith element of the given input must be located
+/// in the semi-open interval `[lower_output[i], upper_output[i])` of `haystack`, in case of
+/// being present at all.
 ///
-/// \tparam Config - [optional] configuration information for the primitive. This can be 
-/// \p upper_bound_config or a custom class with the same members.
-/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values).
-/// \tparam NeedlesIterator - Iterator type for items we are performing upper bound checks
-/// for (keys).
-/// \tparam OutputIterator - Iterator type for the output indices.
-/// \tparam CompareFunction [optional] A callable that can be used to compare two values.
-/// defaults to rocprim::less.
+///  \tparam Config - [optional] Configuration of the primitive. It has to be `upper_bound_config` or
+/// a class derived from it. Default is `default_config`.
+///  \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///  \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. Elements of
+/// the type pointed by it must be comparable to elements of the type pointed by `HaystackIterator`
+/// as either operand of `compare_op`.
+///  \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet
+/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///  \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the
+/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible
+/// to bool. Default type is `::rocprim::less<>`.
+/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage.
+/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`.
+/// \param [in] haystack - Iterator to the first element in the search range. Elements of this
+/// range must be sorted.
+/// \param [in] needles - Iterator to the first element in the range of values to search for on
+/// `haystack`.
+/// \param [out] output - Iterator to the first element in the output range.
+/// \param [in] haystack_size - Number of elements in the search range `haystack`.
+/// \param [in] needles_size - Number of elements in the input range `needles`.
+/// \param [in] compare_op - Binary operation function object that is used to compare values. The
+/// signature of the function should be equivalent to the following:
+/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the
+/// function object must not modify the objects passed to it. Default is `CompareFunction()`.
+/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is
+/// forced in order to check for errors.
+/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of
+/// type `hipError_t`.
 ///
-/// \param [in] temporary_storage - pointer to device-accessible temporary storage. When
-/// a null pointer is passed, the required allocation size (in bytes) is written to
-/// \p storage_size and the function returns without performing the search operation.
-/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage.
-/// \param haystack [in] - iterator pointing to the beginning of the range to search through.
-/// \param needles [in] - iterator pointing to the first of the elements to perform upper
-/// bound checks on.
-/// \param output [out] - Iterator pointing to the beginning of the range where the results
-/// are to be stored.
-/// \param haystack_size [in] - the total number of values to search through.
-/// \param needles_size [in] - the total number of keys to perform upper bound checks for.
-/// \param compare_op [in] - binary operation function that will be used for comparison.
-/// The signature of the function should be equivalent to the following:
-/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
-/// <tt>const &</tt>, but the function object must not modify the objects passed to it.
-/// The default value is \p CompareFunction().
-/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
-/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
-/// launch is forced in order to check for errors. Default value is \p false.
+/// \par Example
+/// \parblock
+/// In this example a device-level upper bound computation on a haystack of double precision type
+/// values is performed on an input array of integer values.
+///
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
+///
+/// // Prepare input and output (declare pointers, allocate device memory etc.).
+/// size_t          haystack_size;    // e.g. 7
+/// double *        haystack;         // e.g. {0, 1.5, 3, 4.5, 6, 7.5, 9}
+/// size_t          needles_size;     // e.g. 5
+/// int *           needles;          // e.g. {1, 2, 3, 4, 5}
+/// compare_op_type compare_op;       // e.g. compare_op_type = rocprim::less<>
+/// size_t *        output;           // empty array of needles_size elements
+///
+/// // Get required size of the temporary storage.
+/// void * temporary_storage = nullptr;
+/// size_t temporary_storage_bytes;
+/// rocprim::upper_bound<config>(temporary_storage,
+///                              temporary_storage_bytes,
+///                              haystack,
+///                              needles,
+///                              output,
+///                              haystack_size,
+///                              needles_size,
+///                              compare_op,
+///                              stream,
+///                              debug_synchronous);
+///
+/// // Allocate temporary storage.
+/// hipMalloc(&temporary_storage, temporary_storage_bytes);
+///
+/// // Perform binary search.
+/// rocprim::upper_bound<config>(temporary_storage,
+///                              temporary_storage_bytes,
+///                              haystack,
+///                              needles,
+///                              output,
+///                              haystack_size,
+///                              needles_size,
+///                              compare_op,
+///                              stream,
+///                              debug_synchronous);
+///
+/// // output = {1, 2, 3, 3, 4}
+/// \endcode
+/// \endparblock
 template<
     class Config = default_config,
     class HaystackIterator,
@@ -217,6 +351,8 @@ hipError_t upper_bound(void * temporary_storage,
                        hipStream_t stream = 0,
                        bool debug_synchronous = false)
 {
+    static_assert(detail::is_default_or_has_tag<Config, detail::upper_bound_config_tag>::value,
+                  "Config must be a specialization of struct template upper_bound_config");
     using value_type  = typename std::iterator_traits<NeedlesIterator>::value_type;
     using output_type = typename std::iterator_traits<OutputIterator>::value_type;
     using config
@@ -237,39 +373,97 @@ hipError_t upper_bound(void * temporary_storage,
                                          debug_synchronous);
 }
 
-/// \brief Performs a device-level parallel binary search.
+/// \brief Parallel primitive for performing a binary search (on a sorted range) of a given input.
+///
+/// The `binary_search` function determines for each element of a given input if it's present
+/// in a given ordered range `haystack`. It uses the search function `detail::binary_search_op`
+/// which in turn uses a binary operator `compare_op` for comparing the input values with the
+/// haystack ones.
 ///
 /// \par Overview
-/// Runs multiple binary searches in parallel. The result is a sequence of bools,
-/// where each bool indicates if the corresponding search succeeded (the key was found)
-/// or not. Results are written by \p output.
+/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes)
+/// is written to `storage_size` and the function returns without performing the search operation.
+///
+///  \tparam Config - [optional] Configuration of the primitive. It has to be `binary_search_config`
+/// or a class derived from it. Default is `default_config`.
+///  \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type.
+///  \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet
+/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. Elements of
+/// the type pointed by it must be comparable to elements of the type pointed by `HaystackIterator`
+/// as either operand of `compare_op`.
+///  \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet
+/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
+///  \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the
+/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible to
+/// bool. Default type is `::rocprim::less<>`.
+/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage.
+/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`.
+/// \param [in] haystack - Iterator to the first element in the search range. Elements of this
+/// range must be sorted.
+/// \param [in] needles - Iterator to the first element in the range of values to search for on
+/// `haystack`.
+/// \param [out] output - Iterator to the first element in the output range of boolean values.
+/// \param [in] haystack_size - Number of elements in the search range `haystack`.
+/// \param [in] needles_size - Number of elements in the input range `needles`.
+/// \param [in] compare_op - Binary operation function object that is used to compare values. The
+/// signature of the function should be equivalent to the following:
+/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the
+/// function object must not modify the objects passed to it. Default is `CompareFunction()`.
+/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream).
+/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is
+/// forced in order to check for errors.
+/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of
+/// type `hipError_t`.
+///
+/// \par Example
+/// \parblock
+/// In this example a device-level binary search on a haystack of integer values is performed on an
+/// input array of integer values too.
 ///
-/// \tparam Config - [optional] configuration information for the primitive. This can be 
-/// \p binary_search_config or a custom class with the same members.
-/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values).
-/// \tparam NeedlesIterator - Iterator type for item we are searching for (keys).
-/// \tparam OutputIterator - Iterator type for the output bools.
-/// \tparam CompareFunction [optional] A callable that can be used to compare two values.
-/// defaults to rocprim::less.
+/// \code{.cpp}
+/// #include <rocprim/rocprim.hpp>
 ///
-/// \param [in] temporary_storage - pointer to device-accessible temporary storage. When
-/// a null pointer is passed, the required allocation size (in bytes) is written to
-/// \p storage_size and the function returns without performing the search operation.
-/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage.
-/// \param haystack [in] - iterator pointing to the beginning of the range to search through.
-/// \param needles [in] - iterator pointing to the first of the elements to find.
-/// \param output [out] - Iterator pointing to the beginning of the range where the results
-/// are to be stored.
-/// \param haystack_size [in] - the total number of values to search through.
-/// \param needles_size [in] - the total number of keys to search for.
-/// \param compare_op [in] - binary operation function that will be used for comparison.
-/// The signature of the function should be equivalent to the following:
-/// <tt>bool f(const T &a, const T &b);</tt>. The signature does not need to have
-/// <tt>const &</tt>, but the function object must not modify the objects passed to it.
-/// The default value is \p CompareFunction().
-/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream).
-/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel
-/// launch is forced in order to check for errors. Default value is \p false.
+/// // Prepare input and output (declare pointers, allocate device memory etc.).
+/// size_t          haystack_size;    // e.g. 10
+/// int *           haystack;         // e.g. {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
+/// size_t          needles_size;     // e.g. 8
+/// int *           needles;          // e.g. {0, 2, 12, 4, 14, 6, 8, 10}
+/// compare_op_type compare_op;       // e.g. compare_op_type = rocprim::less<int>
+/// size_t *        output;           // empty array of needles_size elements
+///
+/// // Get required size of the temporary storage.
+/// void * temporary_storage = nullptr;
+/// size_t temporary_storage_bytes;
+/// rocprim::binary_search<config>(temporary_storage,
+///                                temporary_storage_bytes,
+///                                haystack,
+///                                needles,
+///                                output,
+///                                haystack_size,
+///                                needles_size,
+///                                compare_op,
+///                                stream,
+///                                debug_synchronous);
+///
+/// // Allocate temporary storage.
+/// hipMalloc(&temporary_storage, temporary_storage_bytes);
+///
+/// // Perform binary search.
+/// rocprim::binary_search<config>(temporary_storage,
+///                                temporary_storage_bytes,
+///                                haystack,
+///                                needles,
+///                                output,
+///                                haystack_size,
+///                                needles_size,
+///                                compare_op,
+///                                stream,
+///                                debug_synchronous);
+///
+/// // output = {1, 1, 0, 1, 0, 1, 1, 0}
+/// \endcode
+/// \endparblock
 template<
     class Config = default_config,
     class HaystackIterator,
@@ -289,6 +483,8 @@ hipError_t binary_search(void * temporary_storage,
                          hipStream_t stream = 0,
                          bool debug_synchronous = false)
 {
+    static_assert(detail::is_default_or_has_tag<Config, detail::binary_search_config_tag>::value,
+                  "Config must be a specialization of struct template binary_search_config");
     using value_type  = typename std::iterator_traits<NeedlesIterator>::value_type;
     using output_type = typename std::iterator_traits<OutputIterator>::value_type;
     using config
@@ -309,9 +505,9 @@ hipError_t binary_search(void * temporary_storage,
                                          debug_synchronous);
 }
 
+END_ROCPRIM_NAMESPACE
+
 /// @}
 // end of group devicemodule
 
-END_ROCPRIM_NAMESPACE
-
 #endif // ROCPRIM_DEVICE_DEVICE_BINARY_SEARCH_HPP_
diff --git a/rocprim/include/rocprim/device/device_binary_search_config.hpp b/rocprim/include/rocprim/device/device_binary_search_config.hpp
index bf8b2f75c..7b4a968a0 100644
--- a/rocprim/include/rocprim/device/device_binary_search_config.hpp
+++ b/rocprim/include/rocprim/device/device_binary_search_config.hpp
@@ -56,8 +56,8 @@ struct wrapped_transform_config<default_config_for_binary_search<Value, Output>,
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr transform_config_params params = wrap_transform_config<
-            default_binary_search_config<static_cast<unsigned int>(Arch), Value, Output>>();
+        static constexpr transform_config_params params
+            = default_binary_search_config<static_cast<unsigned int>(Arch), Value, Output>{};
     };
 };
 
@@ -67,8 +67,8 @@ struct wrapped_transform_config<default_config_for_upper_bound<Value, Output>, U
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr transform_config_params params = wrap_transform_config<
-            default_upper_bound_config<static_cast<unsigned int>(Arch), Value, Output>>();
+        static constexpr transform_config_params params
+            = default_upper_bound_config<static_cast<unsigned int>(Arch), Value, Output>{};
     };
 };
 
@@ -78,8 +78,8 @@ struct wrapped_transform_config<default_config_for_lower_bound<Value, Output>, U
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr transform_config_params params = wrap_transform_config<
-            default_lower_bound_config<static_cast<unsigned int>(Arch), Value, Output>>();
+        static constexpr transform_config_params params
+            = default_lower_bound_config<static_cast<unsigned int>(Arch), Value, Output>{};
     };
 };
 
diff --git a/rocprim/include/rocprim/device/device_histogram.hpp b/rocprim/include/rocprim/device/device_histogram.hpp
index 47bd796c9..0e8ff3700 100644
--- a/rocprim/include/rocprim/device/device_histogram.hpp
+++ b/rocprim/include/rocprim/device/device_histogram.hpp
@@ -449,8 +449,7 @@ inline hipError_t histogram_range_impl(void*          temporary_storage,
 /// * Returns the required size of \p temporary_storage in \p storage_size
 /// if \p temporary_storage in a null pointer.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -553,8 +552,7 @@ inline hipError_t histogram_even(void*          temporary_storage,
 /// * Returns the required size of \p temporary_storage in \p storage_size
 /// if \p temporary_storage in a null pointer.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -665,8 +663,7 @@ inline hipError_t histogram_even(void*          temporary_storage,
 ///
 /// \tparam Channels - number of channels interleaved in the input samples.
 /// \tparam ActiveChannels - number of channels being used for computing histograms.
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -778,8 +775,7 @@ inline hipError_t multi_histogram_even(void*          temporary_storage,
 ///
 /// \tparam Channels - number of channels interleaved in the input samples.
 /// \tparam ActiveChannels - number of channels being used for computing histograms.
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -886,8 +882,7 @@ inline hipError_t multi_histogram_even(void*          temporary_storage,
 /// * Returns the required size of \p temporary_storage in \p storage_size
 /// if \p temporary_storage in a null pointer.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -984,8 +979,7 @@ inline hipError_t histogram_range(void*          temporary_storage,
 /// * Returns the required size of \p temporary_storage in \p storage_size
 /// if \p temporary_storage in a null pointer.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -1091,8 +1085,7 @@ inline hipError_t histogram_range(void*          temporary_storage,
 ///
 /// \tparam Channels - number of channels interleaved in the input samples.
 /// \tparam ActiveChannels - number of channels being used for computing histograms.
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
@@ -1199,8 +1192,7 @@ inline hipError_t multi_histogram_range(void*          temporary_storage,
 ///
 /// \tparam Channels - number of channels interleaved in the input samples.
 /// \tparam ActiveChannels - number of channels being used for computing histograms.
-/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config
-/// (preferred) or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it.
 /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam Counter - integer type for histogram bin counters.
diff --git a/rocprim/include/rocprim/device/device_histogram_config.hpp b/rocprim/include/rocprim/device/device_histogram_config.hpp
index 5d27c174b..b631e2a4b 100644
--- a/rocprim/include/rocprim/device/device_histogram_config.hpp
+++ b/rocprim/include/rocprim/device/device_histogram_config.hpp
@@ -32,26 +32,16 @@ BEGIN_ROCPRIM_NAMESPACE
 namespace detail
 {
 
-template<typename HistogramConfig>
-constexpr histogram_config_params wrap_histogram_config()
-{
-    return histogram_config_params{
-        {HistogramConfig::histogram::block_size,
-         HistogramConfig::histogram::items_per_thread,
-         HistogramConfig::histogram::size_limit},
-        HistogramConfig::max_grid_size,
-        HistogramConfig::shared_impl_max_bins,
-        HistogramConfig::shared_impl_histograms
-    };
-}
-
 template<typename HistogramConfig, typename, unsigned int, unsigned int>
 struct wrapped_histogram_config
 {
+    static_assert(std::is_same<typename HistogramConfig::tag, histogram_config_tag>::value,
+                  "Config must be a specialization of struct template histogram_config");
+
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr histogram_config_params params = wrap_histogram_config<HistogramConfig>();
+        static constexpr histogram_config_params params = HistogramConfig{};
     };
 };
 
@@ -62,10 +52,10 @@ struct wrapped_histogram_config<default_config, Sample, Channels, ActiveChannels
     struct architecture_config
     {
         static constexpr histogram_config_params params
-            = wrap_histogram_config<default_histogram_config<static_cast<unsigned int>(Arch),
-                                                             Sample,
-                                                             Channels,
-                                                             ActiveChannels>>();
+            = default_histogram_config<static_cast<unsigned int>(Arch),
+                                       Sample,
+                                       Channels,
+                                       ActiveChannels>{};
     };
 };
 
diff --git a/rocprim/include/rocprim/device/device_merge.hpp b/rocprim/include/rocprim/device/device_merge.hpp
index 9a6fc3d12..30ef30e31 100644
--- a/rocprim/include/rocprim/device/device_merge.hpp
+++ b/rocprim/include/rocprim/device/device_merge.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -216,8 +216,7 @@ hipError_t merge_impl(void * temporary_storage,
 /// if \p temporary_storage in a null pointer.
 /// * Accepts custom compare_functions for merging across the device.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p merge_config or a class derived from it.
 /// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the
@@ -321,8 +320,7 @@ hipError_t merge(void * temporary_storage,
 /// if \p temporary_storage in a null pointer.
 /// * Accepts custom compare_functions for merging across the device.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p merge_config or a class derived from it.
 /// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_merge_sort.hpp b/rocprim/include/rocprim/device/device_merge_sort.hpp
index f4d8365fc..6eb5b1808 100644
--- a/rocprim/include/rocprim/device/device_merge_sort.hpp
+++ b/rocprim/include/rocprim/device/device_merge_sort.hpp
@@ -61,13 +61,12 @@ ROCPRIM_KERNEL
 {
     static constexpr merge_sort_block_sort_config_params params = device_params<Config>();
     block_sort_kernel_impl<params.block_sort_config.block_size,
-                           params.block_sort_config.items_per_thread,
-                           params.block_sort_method>(keys_input,
-                                                     keys_output,
-                                                     values_input,
-                                                     values_output,
-                                                     sorted_block_size,
-                                                     compare_function);
+                           params.block_sort_config.items_per_thread>(keys_input,
+                                                                      keys_output,
+                                                                      values_input,
+                                                                      values_output,
+                                                                      sorted_block_size,
+                                                                      compare_function);
 }
 
 template<class Config,
diff --git a/rocprim/include/rocprim/device/device_partition.hpp b/rocprim/include/rocprim/device/device_partition.hpp
index d133f9e80..3b89a3ed1 100644
--- a/rocprim/include/rocprim/device/device_partition.hpp
+++ b/rocprim/include/rocprim/device/device_partition.hpp
@@ -178,14 +178,20 @@ hipError_t partition_impl(void * temporary_storage,
     size_t*                         selected_count;
     size_t*                         prev_selected_count;
 
+    detail::temp_storage::layout layout{};
+    const hipError_t             layout_result
+        = offset_scan_state_type::get_temp_storage_layout(number_of_blocks, stream, layout);
+    if(layout_result != hipSuccess)
+    {
+        return layout_result;
+    }
+
     const hipError_t partition_result = detail::temp_storage::partition(
         temporary_storage,
         storage_size,
         detail::temp_storage::make_linear_partition(
             // This is valid even with offset_scan_state_with_sleep_type
-            detail::temp_storage::make_partition(
-                &offset_scan_state_storage,
-                offset_scan_state_type::get_temp_storage_layout(number_of_blocks)),
+            detail::temp_storage::make_partition(&offset_scan_state_storage, layout),
             // Note: the following two are to be allocated continuously, so that they can be initialized
             // simultaneously.
             // They have the same base type, so there is no padding between the types.
@@ -200,10 +206,21 @@ hipError_t partition_impl(void * temporary_storage,
     std::chrono::high_resolution_clock::time_point start;
 
     // Create and initialize lookback_scan_state obj
-    auto offset_scan_state
-        = offset_scan_state_type::create(offset_scan_state_storage, number_of_blocks);
-    auto offset_scan_state_with_sleep
-        = offset_scan_state_with_sleep_type::create(offset_scan_state_storage, number_of_blocks);
+    offset_scan_state_type            offset_scan_state{};
+    hipError_t                        result = offset_scan_state_type::create(offset_scan_state,
+                                                       offset_scan_state_storage,
+                                                       number_of_blocks,
+                                                       stream);
+    offset_scan_state_with_sleep_type offset_scan_state_with_sleep{};
+    if(result == hipSuccess) // do not let the second create() overwrite an earlier error
+    {
+        result = offset_scan_state_with_sleep_type::create(
+            offset_scan_state_with_sleep, offset_scan_state_storage, number_of_blocks, stream);
+    }
+    if(result != hipSuccess)
+    {
+        return result;
+    }
 
     hipError_t error;
 
@@ -365,8 +382,7 @@ hipError_t partition_impl(void * temporary_storage,
 /// * Range specified by \p selected_count_output must have at least 1 element.
 /// * Relative order is preserved.
 ///
-/// \tparam Config - [optional] configuration of the primitive. If provided it should be \p
-/// default_config or an instance of \p select_config
+/// \tparam Config - [optional] configuration of the primitive. It has to be an instance of \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be a simple
 /// pointer type.
 /// \tparam SelectedOutputIterator - random-access iterator type of the selected output range. It
@@ -512,8 +528,7 @@ inline hipError_t partition_two_way(void*                       temporary_storag
 /// * Values of \p flag range should be implicitly convertible to `bool` type.
 /// * The relative order of elements in both output ranges matches the input range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. If provided it should be \p
-/// default_config or an instance of \p select_config
+/// \tparam Config - [optional] configuration of the primitive. It has to be an instance of \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam FlagIterator - random-access iterator type of the flag range. It can be
@@ -647,8 +662,7 @@ inline hipError_t partition_two_way(void*                       temporary_storag
 /// * Relative order is preserved for the elements for which the corresponding values from \p flags
 /// are \p true. Other elements are copied in reverse order.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam FlagIterator - random-access iterator type of the flag range. It can be
@@ -769,8 +783,7 @@ hipError_t partition(void * temporary_storage,
 /// * Relative order is preserved for the elements for which the \p predicate returns \p true. Other
 /// elements are copied in reverse order.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. It can be
@@ -914,8 +927,7 @@ hipError_t partition(void * temporary_storage,
 /// minus the number of elements written to \p output_first_part minus the number of elements written
 /// to \p output_second_part.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam FirstOutputIterator - random-access iterator type of the first output range. It can be
diff --git a/rocprim/include/rocprim/device/device_radix_sort.hpp b/rocprim/include/rocprim/device/device_radix_sort.hpp
index b7a16f242..c4676b264 100644
--- a/rocprim/include/rocprim/device/device_radix_sort.hpp
+++ b/rocprim/include/rocprim/device/device_radix_sort.hpp
@@ -620,8 +620,8 @@ inline hipError_t
 
     constexpr bool is_default_config = std::is_same<Config, default_config>::value;
     // if config is not custom, provide default value for merge sort limit
-    constexpr size_t merge_sort_limit = std::
-        conditional<is_default_config, radix_sort_config_v2<>, Config>::type::merge_sort_limit;
+    constexpr size_t merge_sort_limit
+        = std::conditional<is_default_config, radix_sort_config<>, Config>::type::merge_sort_limit;
 
     // Instantiate single sort config to find the threshold that determines which algorithm is used.
 
@@ -732,8 +732,7 @@ inline hipError_t
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -839,8 +838,7 @@ hipError_t radix_sort_keys(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -947,8 +945,7 @@ hipError_t radix_sort_keys_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -1073,8 +1070,7 @@ hipError_t radix_sort_pairs(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -1199,8 +1195,7 @@ hipError_t radix_sort_pairs_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Size - integral type that represents the problem size.
 ///
@@ -1312,8 +1307,7 @@ hipError_t radix_sort_keys(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Size - integral type that represents the problem size.
 ///
@@ -1425,8 +1419,7 @@ hipError_t radix_sort_keys_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Value - value type.
 /// \tparam Size - integral type that represents the problem size.
@@ -1553,8 +1546,7 @@ hipError_t radix_sort_pairs(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Value - value type.
 /// \tparam Size - integral type that represents the problem size.
diff --git a/rocprim/include/rocprim/device/device_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_radix_sort_config.hpp
index 1aca826e2..7bbdba6b1 100644
--- a/rocprim/include/rocprim/device/device_radix_sort_config.hpp
+++ b/rocprim/include/rocprim/device/device_radix_sort_config.hpp
@@ -48,7 +48,7 @@ template<class SingleSortConfig = default_config,
          class MergeSortConfig  = default_config,
          class OnesweepConfig   = default_config,
          size_t MergeSortLimit  = 1024 * 1024>
-struct radix_sort_config_v2
+struct radix_sort_config
 {
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
     /// \brief Configuration of radix sort single kernel.
@@ -62,52 +62,6 @@ struct radix_sort_config_v2
 #endif
 };
 
-/// \brief Legacy configuration of device-level radix sort operation.
-///
-/// \deprecated Due to a new implementation the configuration options no longer match the algorithm
-/// parameters. Use `radix_sort_config_v2` for the new parameters of the algorithm. Only a best
-/// effort mapping is provided for these options, parameters not applicable to the new algorithm
-/// are ignored.
-///
-/// Radix sort is executed in a single tile (at size < BlocksPerItem) or few iterations (passes)
-/// depending on total number of bits to be sorted (\p begin_bit and \p end_bit), each iteration
-/// sorts either \p LongRadixBits or \p ShortRadixBits bits, chosen to cover whole bit range in
-/// optimal way.
-///
-/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit
-/// is 32 there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits.
-///
-/// \tparam LongRadixBits - number of bits in long iterations.
-/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits.
-/// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config.
-/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config.
-template<unsigned int LongRadixBits,
-         unsigned int ShortRadixBits,
-         class ScanConfig,
-         class SortConfig,
-         class SortSingleConfig               = kernel_config<256, 10>,
-         class SortMergeConfig                = kernel_config<1024, 1>,
-         unsigned int MergeSizeLimitBlocks    = 1024U,
-         bool         ForceSingleKernelConfig = false,
-         class OnesweepHistogramConfig        = kernel_config<256, 8>,
-         class OnesweepSortConfig             = kernel_config<256, 15>,
-         unsigned int OnesweepRadixBits       = 4>
-struct [[deprecated("use radix_sort_config_v2")]] radix_sort_config
-{
-#ifndef DOXYGEN_SHOULD_SKIP_THIS
-    /// \brief Configuration of radix sort single kernel.
-    using single_sort_config = SortSingleConfig;
-    /// \brief Configuration of merge sort algorithm.
-    using merge_sort_config = default_config;
-    /// \brief Configuration of radix sort onesweep.
-    using onesweep_config = radix_sort_onesweep_config<OnesweepHistogramConfig,
-                                                       OnesweepSortConfig,
-                                                       OnesweepRadixBits>;
-    /// \brief Maximum number of items to use merge sort algorithm.
-    static constexpr size_t merge_sort_limit = 1024 * MergeSizeLimitBlocks;
-#endif
-};
-
 namespace detail
 {
 
diff --git a/rocprim/include/rocprim/device/device_reduce.hpp b/rocprim/include/rocprim/device/device_reduce.hpp
index 3db6b0661..649e1c583 100644
--- a/rocprim/include/rocprim/device/device_reduce.hpp
+++ b/rocprim/include/rocprim/device/device_reduce.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -256,8 +256,7 @@ hipError_t reduce_impl(void * temporary_storage,
 /// * By default, the input type is used for accumulation. A custom type
 /// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p reduce_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -404,8 +403,7 @@ hipError_t reduce(void * temporary_storage,
 /// * By default, the input type is used for accumulation. A custom type
 /// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p reduce_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/device_reduce_by_key.hpp
index 051b63625..315f75668 100644
--- a/rocprim/include/rocprim/device/device_reduce_by_key.hpp
+++ b/rocprim/include/rocprim/device/device_reduce_by_key.hpp
@@ -215,14 +215,20 @@ hipError_t reduce_by_key_impl(void*                     temporary_storage,
     // The running accumulation across the launch boundary.
     accumulator_type* d_previous_accumulated = nullptr;
 
+    detail::temp_storage::layout layout{};
+    const hipError_t             layout_result
+        = scan_state_type::get_temp_storage_layout(number_of_tiles, stream, layout);
+    if(layout_result != hipSuccess)
+    {
+        return layout_result;
+    }
+
     const hipError_t partition_result = detail::temp_storage::partition(
         temporary_storage,
         storage_size,
         detail::temp_storage::make_linear_partition(
             // This is valid even with scan_state_with_sleep_type
-            detail::temp_storage::make_partition(
-                &scan_state_storage,
-                scan_state_type::get_temp_storage_layout(number_of_tiles)),
+            detail::temp_storage::make_partition(&scan_state_storage, layout),
             detail::temp_storage::make_partition(&ordered_bid_storage,
                                                  ordered_tile_id_type::get_temp_storage_layout()),
             detail::temp_storage::ptr_aligned_array(&d_global_head_count, use_limited_size ? 1 : 0),
@@ -239,12 +245,23 @@ hipError_t reduce_by_key_impl(void*                     temporary_storage,
     {
         return result;
     }
+
+    // Check each create() on its own so a failure of the first call is not overwritten.
+    scan_state_type scan_state{};
+    hipError_t      scan_state_result
+        = scan_state_type::create(scan_state, scan_state_storage, number_of_tiles, stream);
+    scan_state_with_sleep_type scan_state_with_sleep{};
+    const hipError_t sleep_state_result = scan_state_with_sleep_type::create(
+        scan_state_with_sleep, scan_state_storage, number_of_tiles, stream);
+    scan_state_result = scan_state_result != hipSuccess ? scan_state_result : sleep_state_result;
+
+    if(scan_state_result != hipSuccess)
+    {
+        return scan_state_result;
+    }
+
     auto with_scan_state
-        = [use_sleep,
-           scan_state = scan_state_type::create(scan_state_storage, number_of_tiles),
-           scan_state_with_sleep
-           = scan_state_with_sleep_type::create(scan_state_storage, number_of_tiles)](
-              auto&& func) mutable -> decltype(auto)
+        = [use_sleep, scan_state, scan_state_with_sleep](auto&& func) mutable -> decltype(auto)
     {
         if(use_sleep)
         {
@@ -380,8 +397,7 @@ hipError_t reduce_by_key_impl(void*                     temporary_storage,
 /// * Ranges specified by \p unique_output and \p aggregates_output must have at least
 /// <tt>*unique_count_output</tt> (i.e. the number of unique keys) elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be `reduce_by_key_config_v2`
-/// or `default_config`
+/// \tparam Config - [optional] configuration of the primitive. It has to be `reduce_by_key_config` or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam ValuesInputIterator - random-access iterator type of the input range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp
index 6bcf9dca3..e38426467 100644
--- a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp
+++ b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -54,7 +54,7 @@ template<unsigned int         BlockSize,
          block_scan_algorithm ScanAlgorithm    = block_scan_algorithm::using_warp_scan,
          unsigned int         TilesPerBlock    = 1,
          unsigned int         SizeLimit        = ROCPRIM_GRID_SIZE_LIMIT>
-struct reduce_by_key_config_v2
+struct reduce_by_key_config
 {
     /// Number of threads in a block.
     static constexpr unsigned int         block_size         = BlockSize;
@@ -81,25 +81,6 @@ struct reduce_by_key_config_v2
     static constexpr unsigned int         size_limit         = SizeLimit;
 };
 
-/// \brief Legacy configuration of device-level reduce-by-key operation.
-///
-/// \deprecated Due to a new implementation the configuration options no longer match the algorithm
-/// parameters. Use `reduce_by_key_config_v2` for the new parameters of the algorithm. Only a best
-/// effort mapping is provided for these options, parameters not applicable to the new algorithm
-/// are ignored.
-///
-/// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config.
-/// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config.
-template<class ScanConfig, class ReduceConfig>
-struct [[deprecated("use reduce_by_key_config_v2")]] reduce_by_key_config
-    : reduce_by_key_config_v2<ReduceConfig::BlockSize, ReduceConfig::ItemsPerThread>
-{
-    /// \brief Configuration of carry-outs scan kernel.
-    using scan = ScanConfig;
-    /// \brief Configuration of the main reduce-by-key kernel.
-    using reduce = ReduceConfig;
-};
-
 namespace detail
 {
 
@@ -117,25 +98,25 @@ struct fallback_config
     static constexpr unsigned int items_per_thread = std::max(1u, 15u / item_scale);
 
     using type
-        = reduce_by_key_config_v2<detail::limit_block_size<256U,
-                                                           items_per_thread * size_memory_per_item,
-                                                           ROCPRIM_WARP_SIZE_64>::value,
-                                  items_per_thread,
-                                  block_load_method::block_load_transpose,
-                                  block_load_method::block_load_transpose,
-                                  block_scan_algorithm::using_warp_scan,
-                                  2>;
+        = reduce_by_key_config<detail::limit_block_size<256U,
+                                                        items_per_thread * size_memory_per_item,
+                                                        ROCPRIM_WARP_SIZE_64>::value,
+                               items_per_thread,
+                               block_load_method::block_load_transpose,
+                               block_load_method::block_load_transpose,
+                               block_scan_algorithm::using_warp_scan,
+                               2>;
 };
 
 template<unsigned int TargetArch, class Key, class Value>
 struct default_config
     : std::conditional_t<std::max(sizeof(Key), sizeof(Value)) <= 16,
-                         rocprim::reduce_by_key_config_v2<256,
-                                                          15,
-                                                          block_load_method::block_load_transpose,
-                                                          block_load_method::block_load_transpose,
-                                                          block_scan_algorithm::using_warp_scan,
-                                                          sizeof(Value) < 16 ? 1 : 2>,
+                         rocprim::reduce_by_key_config<256,
+                                                       15,
+                                                       block_load_method::block_load_transpose,
+                                                       block_load_method::block_load_transpose,
+                                                       block_scan_algorithm::using_warp_scan,
+                                                       sizeof(Value) < 16 ? 1 : 2>,
                          typename reduce_by_key::fallback_config<Key, Value>::type>
 {};
 
diff --git a/rocprim/include/rocprim/device/device_reduce_config.hpp b/rocprim/include/rocprim/device/device_reduce_config.hpp
index f432ddc51..45f046453 100644
--- a/rocprim/include/rocprim/device/device_reduce_config.hpp
+++ b/rocprim/include/rocprim/device/device_reduce_config.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -36,6 +36,9 @@ namespace detail
 template<typename ReduceConfig, typename>
 struct wrapped_reduce_config
 {
+    static_assert(std::is_same<typename ReduceConfig::tag, reduce_config_tag>::value,
+                  "Config must be a specialization of struct template reduce_config");
+
     template<target_arch Arch>
     struct architecture_config
     {
diff --git a/rocprim/include/rocprim/device/device_run_length_encode.hpp b/rocprim/include/rocprim/device/device_run_length_encode.hpp
index d155edb5b..109561cb1 100644
--- a/rocprim/include/rocprim/device/device_run_length_encode.hpp
+++ b/rocprim/include/rocprim/device/device_run_length_encode.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -76,8 +76,7 @@ namespace detail
 /// * Ranges specified by \p unique_output and \p counts_output must have at least
 /// <tt>*runs_count_output</tt> (i.e. the number of runs) elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p run_length_encode_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the
@@ -192,8 +191,7 @@ hipError_t run_length_encode(void * temporary_storage,
 /// * Ranges specified by \p offsets_output and \p counts_output must have at least
 /// <tt>*runs_count_output</tt> (i.e. the number of non-trivial runs) elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p run_length_encode_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OffsetsOutputIterator - random-access iterator type of the output range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp
index c2ace005e..870fd41fc 100644
--- a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp
+++ b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -36,7 +36,7 @@ BEGIN_ROCPRIM_NAMESPACE
 /// \brief Configuration of device-level run-length encoding operation.
 ///
 /// \tparam ReduceByKeyConfig - configuration of device-level reduce-by-key operation.
-/// Must be \p reduce_by_key_config_v2 or \p default_config.
+/// Must be \p reduce_by_key_config or \p default_config.
 /// \tparam SelectConfig - configuration of device-level select operation.
 /// Must be \p select_config or \p default_config.
 template<
diff --git a/rocprim/include/rocprim/device/device_scan.hpp b/rocprim/include/rocprim/device/device_scan.hpp
index 003ed4f95..2cb648dad 100644
--- a/rocprim/include/rocprim/device/device_scan.hpp
+++ b/rocprim/include/rocprim/device/device_scan.hpp
@@ -226,14 +226,20 @@ inline auto scan_impl(void*               temporary_storage,
     real_init_value_type* previous_last_element;
     real_init_value_type* new_last_element;
 
+    detail::temp_storage::layout layout{};
+    hipError_t                   layout_result
+        = scan_state_type::get_temp_storage_layout(number_of_blocks, stream, layout);
+    if(layout_result != hipSuccess)
+    {
+        return layout_result;
+    }
+
     const hipError_t partition_result = detail::temp_storage::partition(
         temporary_storage,
         storage_size,
         detail::temp_storage::make_linear_partition(
             // This is valid even with offset_scan_state_with_sleep_type
-            detail::temp_storage::make_partition(
-                &scan_state_storage,
-                scan_state_type::get_temp_storage_layout(number_of_blocks)),
+            detail::temp_storage::make_partition(&scan_state_storage, layout),
             detail::temp_storage::ptr_aligned_array(&previous_last_element,
                                                     use_limited_size ? 1 : 0),
             detail::temp_storage::ptr_aligned_array(&new_last_element, use_limited_size ? 1 : 0)));
@@ -251,9 +257,18 @@ inline auto scan_impl(void*               temporary_storage,
     if(number_of_blocks > 1 || use_limited_size)
     {
         // Create and initialize lookback_scan_state obj
-        auto scan_state = scan_state_type::create(scan_state_storage, number_of_blocks);
-        auto scan_state_with_sleep
-            = scan_state_with_sleep_type::create(scan_state_storage, number_of_blocks);
+        // Check each create() on its own so a failure of the first call is not overwritten.
+        scan_state_type scan_state{};
+        hipError_t      result
+            = scan_state_type::create(scan_state, scan_state_storage, number_of_blocks, stream);
+        scan_state_with_sleep_type scan_state_with_sleep{};
+        const hipError_t sleep_result = scan_state_with_sleep_type::create(
+            scan_state_with_sleep, scan_state_storage, number_of_blocks, stream);
+        result = result != hipSuccess ? result : sleep_result;
+        if(result != hipSuccess)
+        {
+            return result;
+        }
 
         hipDeviceProp_t prop;
         int deviceId;
@@ -442,7 +457,7 @@ inline auto scan_impl(void*               temporary_storage,
 /// * By default, the input type is used for accumulation. A custom type
 /// can be specified using <tt>rocprim::transform_iterator</tt>, see the example below.
 ///
-/// \tparam Config - [optional] configuration of the primitive, should be \p scan_config_v2.
+/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -571,7 +586,7 @@ inline hipError_t inclusive_scan(void*             temporary_storage,
 /// if \p temporary_storage in a null pointer.
 /// * Ranges specified by \p input and \p output must have at least \p size elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive, should be \p scan_config_v2.
+/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_scan_by_key.hpp b/rocprim/include/rocprim/device/device_scan_by_key.hpp
index 7b59a0cac..8d8c4fecb 100644
--- a/rocprim/include/rocprim/device/device_scan_by_key.hpp
+++ b/rocprim/include/rocprim/device/device_scan_by_key.hpp
@@ -155,14 +155,20 @@ inline hipError_t scan_by_key_impl(void* const           temporary_storage,
     void*         scan_state_storage;
     wrapped_type* previous_last_value;
 
+    detail::temp_storage::layout layout{};
+    const hipError_t             layout_result
+        = scan_state_type::get_temp_storage_layout(number_of_blocks, stream, layout);
+    if(layout_result != hipSuccess)
+    {
+        return layout_result;
+    }
+
     const hipError_t partition_result = detail::temp_storage::partition(
         temporary_storage,
         storage_size,
         detail::temp_storage::make_linear_partition(
             // This is valid even with offset_scan_state_with_sleep_type
-            detail::temp_storage::make_partition(
-                &scan_state_storage,
-                scan_state_type::get_temp_storage_layout(number_of_blocks)),
+            detail::temp_storage::make_partition(&scan_state_storage, layout),
             detail::temp_storage::ptr_aligned_array(&previous_last_value,
                                                     use_limited_size ? 1 : 0)));
     if(partition_result != hipSuccess || temporary_storage == nullptr)
@@ -181,14 +187,23 @@ inline hipError_t scan_by_key_impl(void* const           temporary_storage,
         return error;
     }
 
+    // Check each create() on its own so a failure of the first call is not overwritten.
+    scan_state_type scan_state{};
+    hipError_t      scan_state_result
+        = scan_state_type::create(scan_state, scan_state_storage, number_of_blocks, stream);
+    scan_state_with_sleep_type scan_state_with_sleep{};
+    const hipError_t sleep_state_result = scan_state_with_sleep_type::create(
+        scan_state_with_sleep, scan_state_storage, number_of_blocks, stream);
+    scan_state_result = scan_state_result != hipSuccess ? scan_state_result : sleep_state_result;
+    if(scan_state_result != hipSuccess)
+    {
+        return scan_state_result;
+    }
+
     // Call the provided function with either scan_state or scan_state_with_sleep based on
     // the value of use_sleep_scan_state
     auto with_scan_state
-        = [use_sleep,
-           scan_state = scan_state_type::create(scan_state_storage, number_of_blocks),
-           scan_state_with_sleep
-           = scan_state_with_sleep_type::create(scan_state_storage, number_of_blocks)](
-              auto&& func) mutable -> decltype(auto)
+        = [use_sleep, scan_state, scan_state_with_sleep](auto&& func) mutable -> decltype(auto)
     {
         if(use_sleep)
         {
@@ -305,7 +320,7 @@ inline hipError_t scan_by_key_impl(void* const           temporary_storage,
 /// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
 /// at least \p size elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive, should be \p scan_by_key_config_v2.
+/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_by_key_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
@@ -428,7 +443,7 @@ inline hipError_t inclusive_scan_by_key(void* const                temporary_sto
 /// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have
 /// at least \p size elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive, should be \p scan_by_key_config_v2.
+/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_by_key_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be
diff --git a/rocprim/include/rocprim/device/device_scan_by_key_config.hpp b/rocprim/include/rocprim/device/device_scan_by_key_config.hpp
index e18ab12fb..7018b874d 100644
--- a/rocprim/include/rocprim/device/device_scan_by_key_config.hpp
+++ b/rocprim/include/rocprim/device/device_scan_by_key_config.hpp
@@ -32,27 +32,16 @@ BEGIN_ROCPRIM_NAMESPACE
 namespace detail
 {
 
-template<typename ScanByKeyConfig>
-constexpr scan_by_key_config_params wrap_scan_by_key_config()
-{
-    return scan_by_key_config_params{
-        {ScanByKeyConfig::block_size,
-         ScanByKeyConfig::items_per_thread,
-         ScanByKeyConfig::size_limit},
-        ScanByKeyConfig::block_load_method,
-        ScanByKeyConfig::block_store_method,
-        ScanByKeyConfig::block_scan_method
-    };
-}
-
 template<typename ScanByKeyConfig, typename, typename>
 struct wrapped_scan_by_key_config
 {
+    static_assert(std::is_same<typename ScanByKeyConfig::tag, scan_by_key_config_tag>::value,
+                  "Config must be a specialization of struct template scan_by_key_config");
+
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr scan_by_key_config_params params
-            = wrap_scan_by_key_config<ScanByKeyConfig>();
+        static constexpr scan_by_key_config_params params = ScanByKeyConfig{};
     };
 };
 
@@ -62,8 +51,8 @@ struct wrapped_scan_by_key_config<default_config, Key, Value>
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr scan_by_key_config_params params = wrap_scan_by_key_config<
-            default_scan_by_key_config<static_cast<unsigned int>(Arch), Key, Value>>();
+        static constexpr scan_by_key_config_params params
+            = default_scan_by_key_config<static_cast<unsigned int>(Arch), Key, Value>{};
     };
 };
 
diff --git a/rocprim/include/rocprim/device/device_scan_config.hpp b/rocprim/include/rocprim/device/device_scan_config.hpp
index 1ebf39636..f2a4254da 100644
--- a/rocprim/include/rocprim/device/device_scan_config.hpp
+++ b/rocprim/include/rocprim/device/device_scan_config.hpp
@@ -32,24 +32,15 @@ BEGIN_ROCPRIM_NAMESPACE
 namespace detail
 {
 
-template<typename ScanConfig>
-constexpr scan_config_params wrap_scan_config()
-{
-    return scan_config_params{
-        {ScanConfig::block_size, ScanConfig::items_per_thread, ScanConfig::size_limit},
-        ScanConfig::block_load_method,
-        ScanConfig::block_store_method,
-        ScanConfig::block_scan_method
-    };
-}
-
 template<typename ScanConfig, typename>
 struct wrapped_scan_config
 {
+    static_assert(std::is_same<typename ScanConfig::tag, scan_config_tag>::value,
+                  "Config must be a specialization of struct template scan_config");
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr scan_config_params params = wrap_scan_config<ScanConfig>();
+        static constexpr scan_config_params params = ScanConfig{};
     };
 };
 
@@ -60,7 +51,7 @@ struct wrapped_scan_config<default_config, Value>
     struct architecture_config
     {
         static constexpr scan_config_params params
-            = wrap_scan_config<default_scan_config<static_cast<unsigned int>(Arch), Value>>();
+            = default_scan_config<static_cast<unsigned int>(Arch), Value>{};
     };
 };
 
diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp
index 18c710f04..576789f06 100644
--- a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp
+++ b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -589,8 +589,8 @@ hipError_t segmented_radix_sort_impl(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -711,8 +711,8 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -834,8 +834,8 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -975,8 +975,8 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the
@@ -1116,8 +1116,8 @@ hipError_t segmented_radix_sort_pairs_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
 /// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
@@ -1244,8 +1244,8 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
 /// requirements of a C++ OutputIterator concept. It can be a simple pointer type.
@@ -1372,8 +1372,8 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Value - value type.
 /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
@@ -1515,8 +1515,8 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage,
 /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range
 /// [100, 10000], <tt>begin_bit = 0</tt> and <tt>end_bit = 14</tt> will cover the whole range.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be
-/// \p segmented_radix_sort_config or a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be
+/// \p segmented_radix_sort_config or a class derived from it.
 /// \tparam Key - key type. Must be an integral type or a floating-point type.
 /// \tparam Value - value type.
 /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the
diff --git a/rocprim/include/rocprim/device/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/device_segmented_reduce.hpp
index aeeb35238..424b291ec 100644
--- a/rocprim/include/rocprim/device/device_segmented_reduce.hpp
+++ b/rocprim/include/rocprim/device/device_segmented_reduce.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -161,8 +161,7 @@ hipError_t segmented_reduce_impl(void * temporary_storage,
 /// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
 /// <tt>offsets + 1</tt> for \p end_offsets.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p reduce_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_segmented_scan.hpp b/rocprim/include/rocprim/device/device_segmented_scan.hpp
index af1a3e446..4cad6e0f4 100644
--- a/rocprim/include/rocprim/device/device_segmented_scan.hpp
+++ b/rocprim/include/rocprim/device/device_segmented_scan.hpp
@@ -166,8 +166,7 @@ hipError_t segmented_scan_impl(void * temporary_storage,
 /// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
 /// <tt>offsets + 1</tt> for \p end_offsets.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -281,8 +280,7 @@ hipError_t segmented_inclusive_scan(void * temporary_storage,
 /// <tt>segments + 1</tt> elements: <tt>offsets</tt> for \p begin_offsets and
 /// <tt>offsets + 1</tt> for \p end_offsets.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -399,8 +397,7 @@ hipError_t segmented_exclusive_scan(void * temporary_storage,
 /// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements.
 /// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -512,8 +509,7 @@ hipError_t segmented_inclusive_scan(void * temporary_storage,
 /// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements.
 /// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
diff --git a/rocprim/include/rocprim/device/device_select.hpp b/rocprim/include/rocprim/device/device_select.hpp
index b2ade6a92..68ce3bbb5 100644
--- a/rocprim/include/rocprim/device/device_select.hpp
+++ b/rocprim/include/rocprim/device/device_select.hpp
@@ -58,8 +58,7 @@ namespace detail
 /// * Range specified by \p selected_count_output must have at least 1 element.
 /// * Values of \p flag range should be implicitly convertible to `bool` type.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam FlagIterator - random-access iterator type of the flag range. It can be
@@ -182,8 +181,7 @@ hipError_t select(void * temporary_storage,
 /// values can be copied into it.
 /// * Range specified by \p selected_count_output must have at least 1 element.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. It can be
 /// a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. It can be
diff --git a/rocprim/include/rocprim/device/device_transform.hpp b/rocprim/include/rocprim/device/device_transform.hpp
index e7fb6cea5..a9de7d827 100644
--- a/rocprim/include/rocprim/device/device_transform.hpp
+++ b/rocprim/include/rocprim/device/device_transform.hpp
@@ -35,11 +35,11 @@
 #include "device_transform_config.hpp"
 #include "detail/device_transform.hpp"
 
-BEGIN_ROCPRIM_NAMESPACE
-
 /// \addtogroup devicemodule
 /// @{
 
+BEGIN_ROCPRIM_NAMESPACE
+
 namespace detail
 {
 
@@ -82,8 +82,7 @@ ROCPRIM_KERNEL
 /// \par Overview
 /// * Ranges specified by \p input and \p output must have at least \p size elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p transform_config or a class derived from it.
 /// \tparam InputIterator - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the
@@ -208,8 +207,7 @@ inline hipError_t transform(InputIterator     input,
 /// \par Overview
 /// * Ranges specified by \p input1, \p input2, and \p output must have at least \p size elements.
 ///
-/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or
-/// a custom class with the same members.
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p transform_config or a class derived from it.
 /// \tparam InputIterator1 - random-access iterator type of the input range. Must meet the
 /// requirements of a C++ InputIterator concept. It can be a simple pointer type.
 /// \tparam InputIterator2 - random-access iterator type of the input range. Must meet the
@@ -285,9 +283,9 @@ hipError_t transform(InputIterator1 input1,
 
 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
 
+END_ROCPRIM_NAMESPACE
+
 /// @}
 // end of group devicemodule
 
-END_ROCPRIM_NAMESPACE
-
 #endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_
diff --git a/rocprim/include/rocprim/device/device_transform_config.hpp b/rocprim/include/rocprim/device/device_transform_config.hpp
index 3b4ed21cf..bf6c6c1a2 100644
--- a/rocprim/include/rocprim/device/device_transform_config.hpp
+++ b/rocprim/include/rocprim/device/device_transform_config.hpp
@@ -42,25 +42,16 @@ template<unsigned int arch, class value_type>
 struct default_transform_config : default_transform_config_base<value_type>
 {};
 
-template<typename TransformConfig>
-constexpr transform_config_params wrap_transform_config()
-{
-    return transform_config_params{
-        {
-         TransformConfig::block_size,
-         TransformConfig::items_per_thread,
-         TransformConfig::size_limit,
-         }
-    };
-}
-
 template<typename TransformConfig, typename>
 struct wrapped_transform_config
 {
+    static_assert(std::is_base_of<transform_config_tag, typename TransformConfig::tag>::value,
+                  "Config must be a specialization of struct template transform_config");
+
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr transform_config_params params = wrap_transform_config<TransformConfig>();
+        static constexpr transform_config_params params = TransformConfig{};
     };
 };
 
@@ -70,8 +61,8 @@ struct wrapped_transform_config<default_config, Value>
     template<target_arch Arch>
     struct architecture_config
     {
-        static constexpr transform_config_params params = wrap_transform_config<
-            default_transform_config<static_cast<unsigned int>(Arch), Value>>();
+        static constexpr transform_config_params params
+            = default_transform_config<static_cast<unsigned int>(Arch), Value>{};
     };
 };
 
diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp
index 3a3a664c1..d5949b601 100644
--- a/rocprim/include/rocprim/intrinsics/thread.hpp
+++ b/rocprim/include/rocprim/intrinsics/thread.hpp
@@ -44,24 +44,6 @@ constexpr unsigned int warp_size()
     return warpSize;
 }
 
-/// \brief Returns a number of threads in a hardware warp for the actual device.
-/// At host side this constant is available at runtime time only.
-///
-/// It is constant for a device.
-ROCPRIM_HOST inline
-unsigned int host_warp_size()
-{
-    int default_hip_device;
-    hipError_t success = hipGetDevice(&default_hip_device);
-    hipDeviceProp_t device_prop;
-    success = hipGetDeviceProperties(&device_prop,default_hip_device);
-
-    if(success != hipSuccess)
-        return -1;
-    else
-        return device_prop.warpSize;
-};
-
 /// \brief Returns a number of threads in a hardware warp for the actual target.
 /// At device side this constant is available at compile time.
 ///
diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp
index 1e09fd74d..7a25d3cc9 100644
--- a/rocprim/include/rocprim/intrinsics/warp.hpp
+++ b/rocprim/include/rocprim/intrinsics/warp.hpp
@@ -117,18 +117,26 @@ int warp_all(int predicate)
 /// @}
 // end of group intrinsicsmodule
 
-/**
- * This function computes a lane mask of active lanes in the warp which which have
- * the same value for <tt>label</tt> as the lane which calls the function. The bit at
- * index \p i in the lane mask is set if the thread of lane \p i calls this function
- * with the same value <tt>label</tt>. Only the least-significant \p LabelBits bits
- * are taken into account when labels are considered to be equal.
- */
+/// \brief Group active lanes having the same bits of \p label
+///
+/// Threads that have the same least significant \p LabelBits bits are grouped into the same group.
+/// Every lane in the warp receives a mask of all active lanes participating in its group.
+///
+/// \tparam LabelBits number of bits to compare between labels
+///
+/// \param [in] label the label for the calling lane
+/// \param [in] valid lanes passing <tt>false</tt> will be ignored for comparisons,
+/// such lanes will not be part of any group, and will always return an empty mask (0)
+///
+/// \return A bit mask of the lanes sharing the same bits for \p label. The result of
+/// lane <tt>i</tt> has bit <tt>j</tt> set if lane <tt>j</tt> is part of the same group as
+/// lane <tt>i</tt>, i.e. lanes <tt>i</tt> and <tt>j</tt> are both valid and were called with
+/// labels that agree in their least significant \p LabelBits bits.
 template<unsigned int LabelBits>
-ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label)
+ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label, bool valid = true)
 {
     // Obtain a mask with the threads which are currently active.
-    lane_mask_type peer_mask = ballot(1);
+    lane_mask_type peer_mask = ballot(valid);
 
     // Compute the final value iteratively by testing each bit separately.
     ROCPRIM_UNROLL
@@ -141,21 +149,24 @@ ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label)
         peer_mask &= (bit_set ? same_mask : ~same_mask);
     }
 
-    return peer_mask;
+    return -lane_mask_type{valid} & peer_mask;
 }
 
-/**
- * This function computes a lane mask of active lanes in the warp which which have
- * the same value for <tt>label</tt> as the lane which calls the function. The bit at
- * index \p i in the lane mask is set if the thread of lane \p i calls this function
- * with the same value <tt>label</tt>. Only the least-significant \p LabelBits bits
- * are taken into account when labels are considered to be equal.
- */
-template<int LabelBits>
-[[deprecated("use rocprim::match_any instead")]] ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type
-    MatchAny(unsigned int label)
+/// \brief Elect a single lane for each group in \p mask
+///
+/// \param [in] mask bit mask of the lanes in the same group as the calling lane.
+/// The <tt>i</tt>-th bit should be set if lane <tt>i</tt> is in the same group
+/// as the calling lane.
+///
+/// \returns <tt>true</tt> for one unspecified lane in the <tt>mask</tt>, false for everyone else.
+/// Returns <tt>false</tt> for all lanes not in any group, that is lanes passing 0 as \p mask.
+///
+/// \pre The relation specified by \p mask must be symmetric and transitive, in other words: the groups
+/// should be consistent between threads.
+ROCPRIM_DEVICE ROCPRIM_INLINE bool group_elect(lane_mask_type mask)
 {
-    return match_any<LabelBits>(label);
+    const unsigned int prev_same_count = ::rocprim::masked_bit_count(mask);
+    return prev_same_count == 0 && mask != 0;
 }
 
 END_ROCPRIM_NAMESPACE
diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp
index c2b587f5b..6a2ecabf8 100644
--- a/rocprim/include/rocprim/rocprim.hpp
+++ b/rocprim/include/rocprim/rocprim.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -45,6 +45,7 @@
 #include "block/block_histogram.hpp"
 #include "block/block_load.hpp"
 #include "block/block_radix_sort.hpp"
+#include "block/block_run_length_decode.hpp"
 #include "block/block_scan.hpp"
 #include "block/block_sort.hpp"
 #include "block/block_store.hpp"
diff --git a/rocprim/include/rocprim/thread/thread_search.hpp b/rocprim/include/rocprim/thread/thread_search.hpp
index 54fe38d25..c6b6e7b03 100644
--- a/rocprim/include/rocprim/thread/thread_search.hpp
+++ b/rocprim/include/rocprim/thread/thread_search.hpp
@@ -75,33 +75,24 @@ ROCPRIM_HOST_DEVICE inline void merge_path_search(
     path_coordinate.y = diagonal - split_min;
 }
 
-
-
-
 /// \brief Returns the offset of the first value within \p input which does not compare less than \p val
 /// \tparam InputIteratorT   - <b>[inferred]</b> Type of iterator for the input data to be searched
 /// \tparam OffsetT          - <b>[inferred]</b> The data type of num_items
 /// \tparam T                - <b>[inferred]</b> The data type of the input sequence elements
 /// \param input     [in]    - Input sequence
-/// \param num_items [out]   - Input sequence length
+/// \param num_items [in]    - Input sequence length
 /// \param val       [in]    - Search Key
 /// \return                  - Offset at which val was found
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound(
-    InputIteratorT      input,
-    OffsetT             num_items,
-    T                   val)
+template<typename InputIteratorT, typename OffsetT, typename T>
+ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound(InputIteratorT input, OffsetT num_items, T val)
 {
     OffsetT retval = 0;
-    while (num_items > 0)
+    while(num_items > 0)
     {
         OffsetT half = num_items >> 1;
-        if (input[retval + half] < val)
+        if(input[retval + half] < val)
         {
-            retval = retval + (half + 1);
+            retval    = retval + (half + 1);
             num_items = num_items - (half + 1);
         }
         else
@@ -113,35 +104,28 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound(
     return retval;
 }
 
-
 /// \brief Returns the offset of the first value within \p input which compares greater than \p val
 /// \tparam InputIteratorT   - <b>[inferred]</b> Type of iterator for the input data to be searched
 /// \tparam OffsetT          - <b>[inferred]</b> The data type of num_items
 /// \tparam T                - <b>[inferred]</b> The data type of the input sequence elements
 /// \param input     [in]    - Input sequence
-/// \param num_items [out]   - Input sequence length
+/// \param num_items [in]    - Input sequence length
 /// \param val       [in]    - Search Key
 /// \return                  - Offset at which val was found
-template <
-    typename InputIteratorT,
-    typename OffsetT,
-    typename T>
-ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound(
-    InputIteratorT      input,              ///< [in] Input sequence
-    OffsetT             num_items,          ///< [in] Input sequence length
-    T                   val)                ///< [in] Search key
+template<typename InputIteratorT, typename OffsetT, typename T>
+ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound(InputIteratorT input, OffsetT num_items, T val)
 {
     OffsetT retval = 0;
-    while (num_items > 0)
+    while(num_items > 0)
     {
         OffsetT half = num_items >> 1;
-        if (val < input[retval + half])
+        if(val < input[retval + half])
         {
             num_items = half;
         }
         else
         {
-            retval = retval + (half + 1);
+            retval    = retval + (half + 1);
             num_items = num_items - (half + 1);
         }
     }
@@ -149,6 +133,42 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound(
     return retval;
 }
 
+/// \brief Returns the offset of the first value within \p input which compares greater than \p val
+/// computed as a statically unrolled loop
+/// \tparam MaxNumItems      - The maximum number of items; it alone fixes the bound of the unrolled loop.
+/// \tparam InputIteratorT   - <b>[inferred]</b> Type of iterator for the input data to be searched
+/// \tparam OffsetT          - <b>[inferred]</b> The data type of num_items
+/// \tparam T                - <b>[inferred]</b> The data type of the input sequence elements
+/// \param input     [in]    - Input sequence
+/// \param num_items [in]    - Input sequence length; presumably at least 1, as the probe is clamped to num_items - 1 (TODO confirm)
+/// \param val       [in]    - Search Key
+/// \return                  - Offset of the first element that compares greater than \p val
+template<int MaxNumItems, typename InputIteratorT, typename OffsetT, typename T>
+ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT static_upper_bound(InputIteratorT input,
+                                                         OffsetT        num_items,
+                                                         T              val)
+{
+    OffsetT lower_bound = 0; // first offset that is still a candidate for the result
+    OffsetT upper_bound = num_items; // end of the candidate range (exclusive)
+#pragma unroll
+    for(int i = 0; i <= Log2<MaxNumItems>::VALUE; i++) // bound depends on MaxNumItems, not num_items
+    {
+        OffsetT mid = lower_bound + (upper_bound - lower_bound) / 2;
+        mid         = rocprim::min(mid, num_items - 1); // never probe beyond the last element
+
+        if(val < input[mid])
+        {
+            upper_bound = mid; // input[mid] is greater than val: the result is at or before mid
+        }
+        else
+        {
+            lower_bound = mid + 1; // input[mid] is not greater than val: the result is after mid
+        }
+    }
+
+    return lower_bound;
+}
+
 END_ROCPRIM_NAMESPACE
 
 #endif // ROCPRIM_THREAD_THREAD_SCAN_HPP_
diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp
index 8d1bc20f2..df567ae33 100644
--- a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp
+++ b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -73,13 +73,7 @@ class warp_reduce_dpp
             // row_shr:8
             output = reduce_op(warp_move_dpp<T, 0x118>(output), output);
         }
-#if ROCPRIM_NAVI
-        if(WarpSize > 16)
-        {
-            // row_bcast:15
-            output = reduce_op(warp_swizzle<T, 0x1e0>(output), output);
-        }
-#else
+#ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST
         if(WarpSize > 16)
         {
             // row_bcast:15
@@ -90,6 +84,14 @@ class warp_reduce_dpp
             // row_bcast:31
             output = reduce_op(warp_move_dpp<T, 0x143>(output), output);
         }
+        static_assert(WarpSize <= 64, "WarpSize > 64 is not supported");
+#else
+        if(WarpSize > 16)
+        {
+            // row_bcast:15
+            output = reduce_op(warp_swizzle<T, 0x1e0>(output), output);
+        }
+        static_assert(WarpSize <= 32, "WarpSize > 32 is not supported without DPP broadcasts");
 #endif
         // Read the result from the last lane of the logical warp
         output = warp_shuffle(output, WarpSize - 1, WarpSize);
diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp
index cbe13674f..9ce2350ba 100644
--- a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp
+++ b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -74,13 +74,7 @@ class warp_scan_dpp
             T t = scan_op(warp_move_dpp<T, 0x118>(output), output); // row_shr:8
             if(row_lane_id >= 8) output = t;
         }
-#if ROCPRIM_NAVI
-        if(WarpSize > 16)
-        {
-            T t = scan_op(warp_swizzle<T, 0x1e0>(output), output); // row_bcast:15
-            if(lane_id % 32 >= 16) output = t;
-        }
-#else
+#ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST
         if(WarpSize > 16)
         {
             T t = scan_op(warp_move_dpp<T, 0x142>(output), output); // row_bcast:15
@@ -91,6 +85,15 @@ class warp_scan_dpp
             T t = scan_op(warp_move_dpp<T, 0x143>(output), output); // row_bcast:31
             if(lane_id >= 32) output = t;
         }
+        static_assert(WarpSize <= 64, "WarpSize > 64 is not supported");
+#else
+        if(WarpSize > 16)
+        {
+            T t = scan_op(warp_swizzle<T, 0x1e0>(output), output); // row_bcast:15
+            if(lane_id % 32 >= 16)
+                output = t;
+        }
+        static_assert(WarpSize <= 32, "WarpSize > 32 is not supported without DPP broadcasts");
 #endif
     }
 
diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py
index d8843fa7c..677e02f3e 100755
--- a/scripts/autotune/create_optimization.py
+++ b/scripts/autotune/create_optimization.py
@@ -459,6 +459,22 @@ class AlgorithmDeviceLowerBound(Algorithm):
     def __init__(self, fallback_entries):
         Algorithm.__init__(self, fallback_entries)
 
+class AlgorithmDeviceAdjacentDifference(Algorithm):
+    algorithm_name = 'device_adjacent_difference'
+    cpp_configuration_template_name = 'adjacent_difference_config_template'
+    config_selection_params = [
+            SelectionType(name='value_type', is_optional=False)]
+    def __init__(self, fallback_entries):
+        Algorithm.__init__(self, fallback_entries)
+
+class AlgorithmDeviceAdjacentDifferenceInplace(Algorithm):
+    algorithm_name = 'device_adjacent_difference_inplace'
+    cpp_configuration_template_name = 'adjacent_difference_inplace_config_template'
+    config_selection_params = [
+            SelectionType(name='value_type', is_optional=False)]
+    def __init__(self, fallback_entries):
+        Algorithm.__init__(self, fallback_entries)
+
 def filt_algo_regex(e, algorithm_name):
     if 'algo_regex' in e:
         return re.match(e['algo_regex'], algorithm_name) is not None
@@ -488,6 +504,10 @@ def create_algorithm(algorithm_name: str, fallback_entries):
         return AlgorithmDeviceUpperBound(fallback_entries)
     elif algorithm_name == 'device_lower_bound':
         return AlgorithmDeviceLowerBound(fallback_entries)
+    elif algorithm_name == 'device_adjacent_difference':
+        return AlgorithmDeviceAdjacentDifference(fallback_entries)
+    elif algorithm_name == 'device_adjacent_difference_inplace':
+        return AlgorithmDeviceAdjacentDifferenceInplace(fallback_entries)
     else:
         raise(NotSupportedError(f'Algorithm "{algorithm_name}" is not supported (yet)'))
 
diff --git a/scripts/autotune/templates/adjacent_difference_config_template b/scripts/autotune/templates/adjacent_difference_config_template
new file mode 100644
index 000000000..f40ad24cd
--- /dev/null
+++ b/scripts/autotune/templates/adjacent_difference_config_template
@@ -0,0 +1,20 @@
+{% extends "config_template" %}
+
+{% macro get_header_guard() %}
+ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_
+{%- endmacro %}
+
+{% macro kernel_configuration(measurement) -%}
+adjacent_difference_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { };
+{%- endmacro %}
+
+{% macro general_case() -%}
+template<unsigned int arch, class value_type, class enable = void>
+struct default_adjacent_difference_config : default_adjacent_difference_config_base<value_type>
+{};
+{%- endmacro %}
+
+{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%}
+// Based on {{ based_on_type }}
+template<class value_type> struct default_adjacent_difference_config<static_cast<unsigned int>({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> :
+{%- endmacro %}
diff --git a/scripts/autotune/templates/adjacent_difference_inplace_config_template b/scripts/autotune/templates/adjacent_difference_inplace_config_template
new file mode 100644
index 000000000..1031bf5e0
--- /dev/null
+++ b/scripts/autotune/templates/adjacent_difference_inplace_config_template
@@ -0,0 +1,20 @@
+{% extends "config_template" %}
+
+{% macro get_header_guard() %}
+ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_
+{%- endmacro %}
+
+{% macro kernel_configuration(measurement) -%}
+adjacent_difference_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { };
+{%- endmacro %}
+
+{% macro general_case() -%}
+template<unsigned int arch, class value_type, class enable = void>
+struct default_adjacent_difference_inplace_config : default_adjacent_difference_config_base<value_type>
+{};
+{%- endmacro %}
+
+{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%}
+// Based on {{ based_on_type }}
+template<class value_type> struct default_adjacent_difference_inplace_config<static_cast<unsigned int>({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> :
+{%- endmacro %}
diff --git a/scripts/autotune/templates/scan_config_template b/scripts/autotune/templates/scan_config_template
index 2650752b4..02f4fafa5 100644
--- a/scripts/autotune/templates/scan_config_template
+++ b/scripts/autotune/templates/scan_config_template
@@ -5,7 +5,7 @@ ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_SCAN_HPP_
 {%- endmacro %}
 
 {% macro kernel_configuration(measurement) -%}
-scan_config_v2<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { };
+scan_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { };
 {%- endmacro %}
 
 {% macro general_case() -%}
diff --git a/scripts/autotune/templates/scanbykey_config_template b/scripts/autotune/templates/scanbykey_config_template
index c59432354..e17a89de7 100644
--- a/scripts/autotune/templates/scanbykey_config_template
+++ b/scripts/autotune/templates/scanbykey_config_template
@@ -5,7 +5,7 @@ ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_SCAN_BY_KEY_HPP_
 {%- endmacro %}
 
 {% macro kernel_configuration(measurement) -%}
-scan_by_key_config_v2<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { };
+scan_by_key_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { };
 {%- endmacro %}
 
 {% macro general_case() -%}
diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt
index 78e039a95..8db2d5c5b 100644
--- a/test/rocprim/CMakeLists.txt
+++ b/test/rocprim/CMakeLists.txt
@@ -234,6 +234,7 @@ add_rocprim_test("rocprim.block_sort_merge_stable" test_block_sort_merge_stable.
 add_rocprim_test_parallel("rocprim.block_radix_rank" test_block_radix_rank.cpp.in)
 add_rocprim_test("rocprim.block_radix_sort" test_block_radix_sort.cpp)
 add_rocprim_test("rocprim.block_reduce" test_block_reduce.cpp)
+add_rocprim_test("rocprim.block_run_length_decode" test_block_run_length_decode.cpp)
 add_rocprim_test_parallel("rocprim.block_scan" test_block_scan.cpp.in)
 add_rocprim_test("rocprim.block_shuffle" test_block_shuffle.cpp)
 add_rocprim_test("rocprim.block_sort_bitonic" test_block_sort_bitonic.cpp)
diff --git a/test/rocprim/test_block_exchange.kernels.hpp b/test/rocprim/test_block_exchange.kernels.hpp
index f3783c9ff..5c56fff43 100644
--- a/test/rocprim/test_block_exchange.kernels.hpp
+++ b/test/rocprim/test_block_exchange.kernels.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -172,15 +172,12 @@ void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, un
 }
 
 // Test for exchange
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 0>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int /*device_id*/) -> typename std::enable_if<Method == 0>::type
 {
     using type = T;
     using output_type = U;
@@ -256,15 +253,12 @@ auto test_block_exchange()
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 1>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int /*device_id*/) -> typename std::enable_if<Method == 1>::type
 {
     using type = T;
     using output_type = U;
@@ -340,15 +334,12 @@ auto test_block_exchange()
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 2>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int device_id) -> typename std::enable_if<Method == 2>::type
 {
     using type = T;
     using output_type = U;
@@ -367,8 +358,11 @@ auto test_block_exchange()
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, output_type(0));
 
-    const size_t warp_size = std::min(block_size, size_t(::rocprim::host_warp_size()));
-    const size_t warps_no = (block_size + warp_size - 1) / warp_size;
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
+
+    const size_t warp_size      = std::min(block_size, size_t(current_device_warp_size));
+    const size_t warps_no       = (block_size + warp_size - 1) / warp_size;
     const size_t items_per_warp = warp_size * items_per_thread;
 
     // Calculate input and expected results on host
@@ -436,15 +430,12 @@ auto test_block_exchange()
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 3>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int device_id) -> typename std::enable_if<Method == 3>::type
 {
     using type = T;
     using output_type = U;
@@ -463,8 +454,11 @@ auto test_block_exchange()
     std::vector<output_type> expected(size);
     std::vector<output_type> output(size, output_type(0));
 
-    const size_t warp_size = std::min(block_size, size_t(::rocprim::host_warp_size()));
-    const size_t warps_no = (block_size + warp_size - 1) / warp_size;
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
+
+    const size_t warp_size      = std::min(block_size, size_t(current_device_warp_size));
+    const size_t warps_no       = (block_size + warp_size - 1) / warp_size;
     const size_t items_per_warp = warp_size * items_per_thread;
 
     // Calculate input and expected results on host
@@ -530,15 +524,12 @@ auto test_block_exchange()
     HIP_CHECK(hipFree(device_output));
 }
 
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 4>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int /*device_id*/) -> typename std::enable_if<Method == 4>::type
 {
     using type = T;
     using output_type = U;
@@ -632,15 +623,12 @@ auto test_block_exchange()
     HIP_CHECK(hipFree(device_ranks));
 }
 
-template<
-    class T,
-    class U,
-    int Method,
-    unsigned int BlockSize = 256U,
-    unsigned int ItemsPerThread = 1U
->
-auto test_block_exchange()
--> typename std::enable_if<Method == 5>::type
+template<class T,
+         class U,
+         int          Method,
+         unsigned int BlockSize      = 256U,
+         unsigned int ItemsPerThread = 1U>
+auto test_block_exchange(int /*device_id*/) -> typename std::enable_if<Method == 5>::type
 {
     using type = T;
     using output_type = U;
@@ -753,7 +741,7 @@ struct static_for
         SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
         HIP_CHECK(hipSetDevice(device_id));
 
-        test_block_exchange<T, U, Method, BlockSize, items[First]>();
+        test_block_exchange<T, U, Method, BlockSize, items[First]>(device_id);
         static_for<First + 1, Last, T, U, Method, BlockSize>::run();
     }
 };
diff --git a/test/rocprim/test_block_radix_sort.cpp b/test/rocprim/test_block_radix_sort.cpp
index f903cee50..bcc032b2b 100644
--- a/test/rocprim/test_block_radix_sort.cpp
+++ b/test/rocprim/test_block_radix_sort.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -40,7 +40,7 @@ struct RocprimBlockRadixSort;
 
 struct Integral;
 #define suite_name RocprimBlockRadixSort
-#define warp_params BlockParamsIntegral
+#define warp_params BlockParamsIntegralExtended
 #define name_suffix Integral
 
 #include "test_block_radix_sort.hpp"
diff --git a/test/rocprim/test_block_run_length_decode.cpp b/test/rocprim/test_block_run_length_decode.cpp
new file mode 100644
index 000000000..c16853af7
--- /dev/null
+++ b/test/rocprim/test_block_run_length_decode.cpp
@@ -0,0 +1,290 @@
+// MIT License
+//
+// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "../common_test_header.hpp"
+
+// required gtest and rocprim headers
+#include <gtest/gtest.h>
+#include <rocprim/block/block_run_length_decode.hpp>
+#include <rocprim/config.hpp>
+#include <rocprim/test_utils_data_generation.hpp>
+
+// additional rocprim headers and test utilities
+#include "rocprim/block/block_load_func.hpp"
+#include "rocprim/block/block_store_func.hpp"
+#include "rocprim/functional.hpp"
+#include "test_utils_types.hpp"
+
+template<class ItemT,
+         class LengthT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread>
+struct Params // compile-time configuration of one typed-test instantiation; re-exports its template arguments
+{
+    using item_type                                    = ItemT; // value type of the run items
+    using length_type                                  = LengthT; // type used for run lengths and decoded offsets
+    static constexpr unsigned block_size               = BlockSize; // threads in the launched block
+    static constexpr unsigned runs_per_thread          = RunsPerThread; // runs loaded by each thread
+    static constexpr unsigned decoded_items_per_thread = DecodedItemsPerThread; // items each thread decodes per window
+};
+
+template<class Params>
+class HipcubBlockRunLengthDecodeTest : public ::testing::Test // stateless typed-test fixture
+{
+public:
+    using params = Params; // read by the test body as TestFixture::params
+};
+
+using HipcubBlockRunLengthDecodeTestParams // Params<ItemT, LengthT, BlockSize, RunsPerThread, DecodedItemsPerThread>
+    = ::testing::Types<Params<int, int, 256, 4, 4>, // 4 runs per thread, 4 decoded items per thread
+                       Params<double, char, 256, 4, 4>,
+                       Params<char, long long, 256, 4, 4>,
+                       Params<float, int, 256, 4, 4>,
+                       Params<rocprim::half, int, 256, 4, 4>,
+                       Params<rocprim::bfloat16, int, 256, 4, 4>,
+                       // 8 runs per thread, 8 decoded items per thread
+                       Params<int, int, 256, 8, 8>,
+                       Params<double, char, 256, 8, 8>,
+                       Params<char, long long, 256, 8, 8>,
+                       Params<float, int, 256, 8, 8>,
+                       Params<rocprim::half, int, 256, 8, 8>,
+                       Params<rocprim::bfloat16, int, 256, 8, 8>,
+                       // a single run per thread, 14 decoded items per thread
+                       Params<int, int, 256, 1, 14>,
+                       Params<double, char, 256, 1, 14>,
+                       Params<char, long long, 256, 1, 14>,
+                       Params<float, int, 256, 1, 14>,
+                       Params<rocprim::half, int, 256, 1, 14>,
+                       Params<rocprim::bfloat16, int, 256, 1, 14>,
+                       // more runs per thread (9) than decoded items per thread (7)
+                       Params<int, int, 256, 9, 7>,
+                       Params<double, char, 256, 9, 7>,
+                       Params<char, long long, 256, 9, 7>,
+                       Params<float, int, 256, 9, 7>,
+                       Params<rocprim::half, int, 256, 9, 7>,
+                       Params<rocprim::bfloat16, int, 256, 9, 7>>;
+
+TYPED_TEST_SUITE(HipcubBlockRunLengthDecodeTest, HipcubBlockRunLengthDecodeTestParams);
+
+template<class ItemT,
+         class LengthT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread>
+__global__ // test kernel: run-length decodes the loaded runs and stores the decoded items and offsets
+    __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT*   d_run_items,
+                                                                     const LengthT* d_run_lengths,
+                                                                     ItemT*         d_decoded_items,
+                                                                     LengthT* d_decoded_offsets)
+{
+    using BlockRunLengthDecodeT
+        = rocprim::block_run_length_decode<ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread>;
+    // Number of decoded items the whole block handles per decode window.
+    static constexpr unsigned int decoded_items_per_block = BlockSize * DecodedItemsPerThread;
+
+    ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage;
+    // Each thread loads RunsPerThread run items and run lengths via block_load_direct_blocked.
+    ItemT   run_items[RunsPerThread];
+    LengthT run_lengths[RunsPerThread];
+
+    const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x;
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items);
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_lengths, run_lengths);
+    // NOTE(review): total_decoded_size is presumably filled in by the constructor (it bounds the loop below) - confirm.
+    unsigned              total_decoded_size{};
+    BlockRunLengthDecodeT block_run_length_decode(temp_storage,
+                                                  run_items,
+                                                  run_lengths,
+                                                  total_decoded_size);
+    // Decode window by window, advancing by decoded_items_per_block, until total_decoded_size is reached.
+    unsigned decoded_window_offset = 0;
+    while(decoded_window_offset < total_decoded_size)
+    {
+        ItemT   decoded_items[DecodedItemsPerThread];
+        LengthT decoded_offsets[DecodedItemsPerThread];
+
+        block_run_length_decode.run_length_decode(decoded_items,
+                                                  decoded_offsets,
+                                                  decoded_window_offset);
+        // The last store argument is min(items remaining, window size), which guards a partial final window.
+        rocprim::block_store_direct_blocked(
+            global_thread_idx,
+            d_decoded_items + decoded_window_offset,
+            decoded_items,
+            rocprim::minimum<unsigned int>{}(total_decoded_size - decoded_window_offset,
+                                             decoded_items_per_block));
+
+        rocprim::block_store_direct_blocked(
+            global_thread_idx,
+            d_decoded_offsets + decoded_window_offset,
+            decoded_offsets,
+            rocprim::minimum<unsigned int>{}(total_decoded_size - decoded_window_offset,
+                                             decoded_items_per_block));
+
+        decoded_window_offset += decoded_items_per_block;
+    }
+}
+
+TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode)
+{
+    const int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id= " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+    // Outline: build random runs on the host, decode them on the device with one block, compare with a host reference.
+    using ItemT                                 = typename TestFixture::params::item_type;
+    using LengthT                               = typename TestFixture::params::length_type;
+    constexpr unsigned block_size               = TestFixture::params::block_size;
+    constexpr unsigned runs_per_thread          = TestFixture::params::runs_per_thread;
+    constexpr unsigned decoded_items_per_thread = TestFixture::params::decoded_items_per_thread;
+    // Repeat for random_seeds_count seeds drawn from rand(), followed by the fixed values in seeds[].
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed= " << seed_value);
+        // One block's worth of runs; run lengths are capped at 1000 or LengthT's maximum, whichever is smaller.
+        size_t            num_runs       = runs_per_thread * block_size;
+        constexpr LengthT max_run_length = static_cast<LengthT>(
+            std::min(1000ll, static_cast<long long>(std::numeric_limits<LengthT>::max())));
+        // Draw run items so that neighbouring runs never hold the same value (redraw on collision).
+        auto run_items = std::vector<ItemT>(num_runs);
+        run_items[0] = test_utils::get_random_value<ItemT>(test_utils::numeric_limits<ItemT>::min(),
+                                                           test_utils::numeric_limits<ItemT>::max(),
+                                                           ++seed_value);
+
+        size_t run_item_index = 1;
+        while(run_item_index < num_runs)
+        {
+            run_items[run_item_index]
+                = test_utils::get_random_value<ItemT>(test_utils::numeric_limits<ItemT>::min(),
+                                                      test_utils::numeric_limits<ItemT>::max(),
+                                                      ++seed_value);
+            if(test_utils::convert_to_native(run_items[run_item_index])
+               != test_utils::convert_to_native(run_items[run_item_index - 1]))
+            {
+                ++run_item_index;
+            }
+        }
+        // Every real run gets a random length between 1 and max_run_length.
+        auto run_lengths = test_utils::get_random_data<LengthT>(num_runs,
+                                                                static_cast<LengthT>(1),
+                                                                max_run_length,
+                                                                seed_value);
+        // Append 1 to 4 zero-length runs after the real ones; they contribute nothing to the expected output.
+        std::default_random_engine            prng(seed_value);
+        std::uniform_int_distribution<size_t> num_empty_runs_dist(1, 4);
+        const size_t                          num_trailing_empty_runs = num_empty_runs_dist(prng);
+        num_runs += num_trailing_empty_runs;
+        // NOTE(review): the kernel loads block_size * runs_per_thread runs, so these extra runs look unread - confirm.
+        const auto empty_run_items
+            = test_utils::get_random_data<ItemT>(num_trailing_empty_runs,
+                                                 test_utils::numeric_limits<ItemT>::min(),
+                                                 test_utils::numeric_limits<ItemT>::max(),
+                                                 seed_value);
+        run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end());
+        run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast<LengthT>(0));
+        // Host reference: every run item repeated run-length times.
+        std::vector<ItemT> expected;
+        for(size_t i = 0; i < run_items.size(); ++i)
+        {
+            for(size_t j = 0; j < static_cast<size_t>(run_lengths[i]); ++j)
+            {
+                expected.push_back(run_items[i]);
+            }
+        }
+        // Upload runs and lengths; the output buffers are sized to the expected decode.
+        ItemT* d_run_items{};
+        HIP_CHECK(
+            test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT)));
+        HIP_CHECK(hipMemcpy(d_run_items,
+                            run_items.data(),
+                            run_items.size() * sizeof(ItemT),
+                            hipMemcpyHostToDevice));
+
+        LengthT* d_run_lengths{};
+        HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths,
+                                                     run_lengths.size() * sizeof(LengthT)));
+        HIP_CHECK(hipMemcpy(d_run_lengths,
+                            run_lengths.data(),
+                            run_lengths.size() * sizeof(LengthT),
+                            hipMemcpyHostToDevice));
+
+        ItemT* d_decoded_runs{};
+        HIP_CHECK(
+            test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT)));
+
+        LengthT* d_decoded_offsets{};
+        HIP_CHECK(test_common_utils::hipMallocHelper(&d_decoded_offsets,
+                                                     expected.size() * sizeof(LengthT)));
+        block_run_length_decode_kernel<ItemT,
+                                       LengthT,
+                                       block_size,
+                                       runs_per_thread,
+                                       decoded_items_per_thread>
+            <<<dim3(1), dim3(block_size), 0, 0>>>(d_run_items,
+                                                  d_run_lengths,
+                                                  d_decoded_runs,
+                                                  d_decoded_offsets);
+
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        std::vector<ItemT> output(expected.size());
+        HIP_CHECK(hipMemcpy(output.data(),
+                            d_decoded_runs,
+                            output.size() * sizeof(ItemT),
+                            hipMemcpyDeviceToHost));
+        HIP_CHECK(hipGetLastError());
+
+        std::vector<LengthT> offsets(expected.size());
+        HIP_CHECK(hipMemcpy(offsets.data(),
+                            d_decoded_offsets,
+                            offsets.size() * sizeof(LengthT),
+                            hipMemcpyDeviceToHost));
+        // Device memory is released before the assertions below run.
+        HIP_CHECK(hipFree(d_run_items));
+        HIP_CHECK(hipFree(d_run_lengths));
+        HIP_CHECK(hipFree(d_decoded_runs));
+        HIP_CHECK(hipFree(d_decoded_offsets));
+        // Starts at unsigned -1 so that a first item equal to ItemT{} still gives offset 0 after the increment.
+        unsigned int expected_offset = -1;
+        ItemT        previous_value  = ItemT{};
+        for(size_t i = 0; i < output.size(); ++i)
+        {
+            ASSERT_EQ(test_utils::convert_to_native(output[i]),
+                      test_utils::convert_to_native(expected[i]));
+            if(test_utils::convert_to_native(output[i])
+               != test_utils::convert_to_native(previous_value))
+            {
+                previous_value  = output[i];
+                expected_offset = 0;
+            }
+            else
+            {
+                ++expected_offset;
+            }
+
+            ASSERT_EQ(offsets[i], expected_offset);
+        }
+    }
+}
diff --git a/test/rocprim/test_device_adjacent_difference.cpp b/test/rocprim/test_device_adjacent_difference.cpp
index 63d7f2269..784b7b950 100644
--- a/test/rocprim/test_device_adjacent_difference.cpp
+++ b/test/rocprim/test_device_adjacent_difference.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -36,28 +36,6 @@
 
 namespace
 {
-template <typename T, unsigned int SizeLimit>
-struct size_limit_config
-{
-    static constexpr unsigned int item_scale
-        = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(T), sizeof(int));
-
-    using type
-        = rocprim::adjacent_difference_config<256,
-                                              ::rocprim::max(1u, 16u / item_scale),
-                                              rocprim::block_load_method::block_load_transpose,
-                                              rocprim::block_store_method::block_store_transpose,
-                                              SizeLimit>;
-};
-
-template <typename T>
-struct size_limit_config<T, ROCPRIM_GRID_SIZE_LIMIT>
-{
-    using type = rocprim::default_config;
-};
-
-template <typename T, unsigned int SizeLimit>
-using size_limit_config_t = typename size_limit_config<T, SizeLimit>::type;
 
 template <typename Config = rocprim::default_config,
           typename InputIt,
@@ -146,12 +124,13 @@ auto get_expected_result(const std::vector<T>& input,
 } // namespace
 
 // Params for tests
-template <class InputType,
-          class OutputType                 = InputType,
-          bool         Left                = true,
-          bool         InPlace             = false,
-          bool         UseIdentityIterator = false,
-          unsigned int SizeLimit           = ROCPRIM_GRID_SIZE_LIMIT>
+template<class InputType,
+         class OutputType         = InputType,
+         bool Left                = true,
+         bool InPlace             = false,
+         bool UseIdentityIterator = false,
+         class Config             = rocprim::default_config>
+
 struct DeviceAdjacentDifferenceParams
 {
     using input_type                              = InputType;
@@ -159,7 +138,7 @@ struct DeviceAdjacentDifferenceParams
     static constexpr bool   left                  = Left;
     static constexpr bool   in_place              = InPlace;
     static constexpr bool   use_identity_iterator = UseIdentityIterator;
-    static constexpr size_t size_limit            = SizeLimit;
+    using config                                  = Config;
 };
 
 template <class Params>
@@ -172,25 +151,38 @@ class RocprimDeviceAdjacentDifferenceTests : public ::testing::Test
     static constexpr bool   in_place              = Params::in_place;
     static constexpr bool   use_identity_iterator = Params::use_identity_iterator;
     static constexpr bool   debug_synchronous     = false;
-    static constexpr size_t size_limit            = Params::size_limit;
+    using config                                  = typename Params::config;
 };
 
 using custom_double2     = test_utils::custom_test_type<double>;
 using custom_int64_array = test_utils::custom_test_array_type<std::int64_t, 8>;
 
+using custom_config_0 = rocprim::adjacent_difference_config<128, 4>;
+
+template<int SizeLimit>
+using custom_size_limit_config
+    = rocprim::adjacent_difference_config<1024,
+                                          2,
+                                          rocprim::block_load_method::block_load_transpose,
+                                          rocprim::block_store_method::block_store_transpose,
+                                          SizeLimit>;
+
 using RocprimDeviceAdjacentDifferenceTestsParams = ::testing::Types<
+    // Tests with default configuration
     DeviceAdjacentDifferenceParams<int>,
     DeviceAdjacentDifferenceParams<float, double, false>,
     DeviceAdjacentDifferenceParams<int8_t, int8_t, true, true>,
     DeviceAdjacentDifferenceParams<custom_double2, custom_double2, false, true>,
-    DeviceAdjacentDifferenceParams<rocprim::bfloat16, float, true, false, false, 512>,
-    DeviceAdjacentDifferenceParams<rocprim::half, rocprim::half, true, true, false, 2048>,
-    DeviceAdjacentDifferenceParams<custom_int64_array,
-                                   custom_int64_array,
-                                   false,
-                                   true,
-                                   true,
-                                   4096>>;
+    DeviceAdjacentDifferenceParams<rocprim::bfloat16, float, true, false, false>,
+    DeviceAdjacentDifferenceParams<rocprim::half, rocprim::half, true, true, false>,
+    DeviceAdjacentDifferenceParams<custom_int64_array, custom_int64_array, false, true, true>,
+    // Tests for supported config structs
+    DeviceAdjacentDifferenceParams<rocprim::bfloat16, float, true, false, false, custom_config_0>,
+    DeviceAdjacentDifferenceParams<rocprim::bfloat16, float, true, false, false>,
+    // Tests for different size_limits
+    DeviceAdjacentDifferenceParams<int, int, true, false, false, custom_size_limit_config<64>>,
+    DeviceAdjacentDifferenceParams<int, int, true, false, false, custom_size_limit_config<8192>>,
+    DeviceAdjacentDifferenceParams<int, int, true, false, false, custom_size_limit_config<10240>>>;
 
 TYPED_TEST_SUITE(RocprimDeviceAdjacentDifferenceTests, RocprimDeviceAdjacentDifferenceTestsParams);
 
@@ -206,7 +198,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference)
     static constexpr bool in_place              = TestFixture::in_place;
     static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator;
     static constexpr bool debug_synchronous     = TestFixture::debug_synchronous;
-    using Config                                = size_limit_config_t<T, TestFixture::size_limit>;
+    using Config                                = typename TestFixture::config;
 
     SCOPED_TRACE(testing::Message() << "left = " << left << ", in_place = " << in_place);
 
diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp
index b94138221..e7370ab61 100644
--- a/test/rocprim/test_device_binary_search.cpp
+++ b/test/rocprim/test_device_binary_search.cpp
@@ -52,37 +52,24 @@ class RocprimDeviceBinarySearch : public ::testing::Test {
 using custom_int2 = test_utils::custom_test_type<int>;
 using custom_double2 = test_utils::custom_test_type<double>;
 
-using custom_config_0 = rocprim::transform_config<128, 4>;
-using custom_config_1 = rocprim::binary_search_config<64, 2>;
-struct custom_config_2
-{
-    static constexpr unsigned int block_size       = 256;
-    static constexpr unsigned int items_per_thread = 1;
-    static constexpr unsigned int size_limit       = ROCPRIM_GRID_SIZE_LIMIT;
-};
-
-typedef ::testing::Types<params<int, int>,
-                         params<unsigned long long,
-                                unsigned long long,
-                                size_t,
-                                rocprim::greater<unsigned long long>,
-                                custom_config_0>,
-                         params<float, double, unsigned int, rocprim::greater<double>>,
-                         params<double, int>,
-                         params<int8_t, int8_t>,
-                         params<uint8_t, uint8_t>,
-                         params<rocprim::half, rocprim::half, size_t, rocprim::less<rocprim::half>>,
-                         params<rocprim::bfloat16,
-                                rocprim::bfloat16,
-                                size_t,
-                                rocprim::less<rocprim::bfloat16>,
-                                custom_config_1>,
-                         params<custom_int2, custom_int2>,
-                         params<custom_double2,
-                                custom_double2,
-                                unsigned int,
-                                rocprim::greater<custom_double2>,
-                                custom_config_2>>
+struct use_custom_config
+{};
+
+typedef ::testing::Types<
+    params<int, int>,
+    params<unsigned long long, unsigned long long, size_t, rocprim::greater<unsigned long long>>,
+    params<float, double, unsigned int, rocprim::greater<double>>,
+    params<double, int>,
+    params<int8_t, int8_t>,
+    params<uint8_t, uint8_t>,
+    params<rocprim::half, rocprim::half, size_t, rocprim::less<rocprim::half>>,
+    params<rocprim::bfloat16,
+           rocprim::bfloat16,
+           size_t,
+           rocprim::less<rocprim::bfloat16>,
+           use_custom_config>,
+    params<custom_int2, custom_int2>,
+    params<custom_double2, custom_double2, unsigned int, rocprim::greater<custom_double2>>>
     Params;
 
 TYPED_TEST_SUITE(RocprimDeviceBinarySearch, Params);
@@ -97,7 +84,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound)
     using needle_type = typename TestFixture::params::needle_type;
     using output_type = typename TestFixture::params::output_type;
     using compare_op_type = typename TestFixture::params::compare_op_type;
-    using config          = typename TestFixture::params::config;
+    using config          = std::conditional_t<
+        std::is_same<typename TestFixture::params::config, use_custom_config>::value,
+        rocprim::lower_bound_config<64, 2>,
+        typename TestFixture::params::config>;
 
     hipStream_t stream = 0;
 
@@ -218,7 +208,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound)
     using needle_type = typename TestFixture::params::needle_type;
     using output_type = typename TestFixture::params::output_type;
     using compare_op_type = typename TestFixture::params::compare_op_type;
-    using config          = typename TestFixture::params::config;
+    using config          = std::conditional_t<
+        std::is_same<typename TestFixture::params::config, use_custom_config>::value,
+        rocprim::upper_bound_config<64, 2>,
+        typename TestFixture::params::config>;
 
     hipStream_t stream = 0;
 
@@ -338,7 +331,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch)
     using needle_type = typename TestFixture::params::needle_type;
     using output_type = typename TestFixture::params::output_type;
     using compare_op_type = typename TestFixture::params::compare_op_type;
-    using config          = typename TestFixture::params::config;
+    using config          = std::conditional_t<
+        std::is_same<typename TestFixture::params::config, use_custom_config>::value,
+        rocprim::binary_search_config<64, 2>,
+        typename TestFixture::params::config>;
 
     hipStream_t stream = 0;
 
diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp
index 03cf47762..20c9c92c1 100644
--- a/test/rocprim/test_device_histogram.cpp
+++ b/test/rocprim/test_device_histogram.cpp
@@ -100,16 +100,6 @@ struct transform_op
     }
 };
 
-// provides the same members as rocprim::histogram_config
-struct user_config
-{
-    using histogram = ::rocprim::kernel_config<256, 1>;
-
-    static constexpr unsigned int max_grid_size          = 1024;
-    static constexpr unsigned int shared_impl_max_bins   = 2048;
-    static constexpr unsigned int shared_impl_histograms = 3;
-};
-
 template<class SampleType,
          unsigned int Bins,
          int          LowerLevel,
@@ -143,7 +133,7 @@ typedef ::testing::Types<params1<int, 10, 0, 10>,
                          params1<unsigned char, 10, 20, 240, unsigned char, unsigned int>,
                          params1<unsigned char, 256, 0, 256, short>,
 
-                         params1<double, 10, 0, 1000, double, int, user_config>,
+                         params1<double, 10, 0, 1000, double, int>,
                          params1<int, 123, 100, 5635, int>,
                          params1<double, 55, -123, +123, double, unsigned int, custom_config1>>
     Params1;
diff --git a/test/rocprim/test_device_radix_sort.cpp.in b/test/rocprim/test_device_radix_sort.cpp.in
index bd37a9fb5..b56e85889 100644
--- a/test/rocprim/test_device_radix_sort.cpp.in
+++ b/test/rocprim/test_device_radix_sort.cpp.in
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -49,6 +49,10 @@
 #endif
 
 #if   ROCPRIM_TEST_TYPE_SLICE == 0
+#if   defined(__GNUC__) || defined(__clang__)
+    INSTANTIATE(params<__int128_t,          __int128_t>)
+    INSTANTIATE(params<__uint128_t,         __uint128_t>)
+#endif
     INSTANTIATE(params<signed char,         double, true>)
     INSTANTIATE(params<int,                 short>)
     INSTANTIATE(params<short,               int,    true>)
diff --git a/test/rocprim/test_device_radix_sort.hpp b/test/rocprim/test_device_radix_sort.hpp
index 628d0f73b..0b7e81620 100644
--- a/test/rocprim/test_device_radix_sort.hpp
+++ b/test/rocprim/test_device_radix_sort.hpp
@@ -139,10 +139,10 @@ inline void sort_keys()
                 test_utils::key_comparator<key_type, descending, start_bit, end_bit>());
 
             // Use arbitrary custom config to increase test coverage without making more test cases
-            using config = rocprim::radix_sort_config_v2<rocprim::default_config,
-                                                         rocprim::default_config,
-                                                         rocprim::default_config,
-                                                         1024 * 512>;
+            using config = rocprim::radix_sort_config<rocprim::default_config,
+                                                      rocprim::default_config,
+                                                      rocprim::default_config,
+                                                      1024 * 512>;
 
             size_t temporary_storage_bytes;
             HIP_CHECK(rocprim::radix_sort_keys<config>(nullptr,
@@ -319,7 +319,7 @@ inline void sort_pairs()
             }
 
             // Use arbitrary custom config to increase test coverage without making more test cases
-            using config = rocprim::radix_sort_config_v2<
+            using config = rocprim::radix_sort_config<
                 rocprim::kernel_config<256, 1>,
                 rocprim::merge_sort_config<128, 64, 2, 128, 64, 2>,
                 rocprim::radix_sort_onesweep_config<rocprim::kernel_config<128, 1>,
diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp
index 38d728c89..c0a9d8285 100644
--- a/test/rocprim/test_device_reduce_by_key.cpp
+++ b/test/rocprim/test_device_reduce_by_key.cpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -543,3 +543,95 @@ TEST(RocprimDeviceReduceByKey, LargeSegmentCountReduceByKeyLargeValueType)
     // large value type to test TilesPerBlock > 1
     large_segment_count_reduce_by_key<test_utils::custom_test_type<size_t>>();
 }
+
+TEST(RocprimDeviceReduceByKey, ReduceByNonEqualKeys) // unique count must equal the input size
+{
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    using key_type   = size_t;
+    using value_type = unsigned int;
+
+    const bool debug_synchronous = false;
+    // The comparator never reports two keys as equal, so every input item starts a new segment.
+    ::rocprim::plus<value_type> reduce_op;
+    auto                        key_compare_op = [](const auto&, const auto&) { return false; };
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        for(size_t block_size_multiple : test_utils::get_block_size_multiples(seed_value, 256))
+        {
+            const size_t size = block_size_multiple + 1;
+
+            SCOPED_TRACE(testing::Message() << "with size = " << size);
+
+            hipStream_t stream = 0; // default
+
+            // Using segments of size 1.
+            auto d_keys_input = rocprim::make_counting_iterator(key_type(0));
+
+            // All values are 1; the aggregates are discarded below, only the run count is checked.
+            auto d_values_input = rocprim::constant_iterator<value_type>(1);
+            // One run per input item is expected because no two keys compare equal.
+            size_t unique_count_expected = size;
+
+            // Discard all output
+            auto d_unique_output     = rocprim::make_discard_iterator();
+            auto d_aggregates_output = rocprim::make_discard_iterator();
+
+            size_t* d_unique_count_output;
+            HIP_CHECK(test_common_utils::hipMallocHelper(&d_unique_count_output, sizeof(size_t)));
+
+            size_t temporary_storage_bytes;
+            // First call passes nullptr storage; the size it reports is used to allocate below.
+            HIP_CHECK(rocprim::reduce_by_key(nullptr,
+                                             temporary_storage_bytes,
+                                             d_keys_input,
+                                             d_values_input,
+                                             size,
+                                             d_unique_output,
+                                             d_aggregates_output,
+                                             d_unique_count_output,
+                                             reduce_op,
+                                             key_compare_op,
+                                             stream,
+                                             debug_synchronous));
+
+            ASSERT_GT(temporary_storage_bytes, 0);
+
+            void* d_temporary_storage;
+            HIP_CHECK(
+                test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes));
+            // Second call, now with the allocated temporary storage.
+            HIP_CHECK(rocprim::reduce_by_key(d_temporary_storage,
+                                             temporary_storage_bytes,
+                                             d_keys_input,
+                                             d_values_input,
+                                             size,
+                                             d_unique_output,
+                                             d_aggregates_output,
+                                             d_unique_count_output,
+                                             reduce_op,
+                                             key_compare_op,
+                                             stream,
+                                             debug_synchronous));
+
+            HIP_CHECK(hipFree(d_temporary_storage));
+
+            size_t unique_count_output;
+            HIP_CHECK(hipMemcpy(&unique_count_output,
+                                d_unique_count_output,
+                                sizeof(unique_count_output),
+                                hipMemcpyDeviceToHost));
+
+            HIP_CHECK(hipFree(d_unique_count_output));
+
+            ASSERT_EQ(unique_count_output, unique_count_expected);
+        }
+    }
+}
diff --git a/test/rocprim/test_device_scan.cpp b/test/rocprim/test_device_scan.cpp
index 5f7aed218..4277b1cd5 100644
--- a/test/rocprim/test_device_scan.cpp
+++ b/test/rocprim/test_device_scan.cpp
@@ -43,43 +43,24 @@ struct default_config_helper
     using type = ::rocprim::default_config;
 };
 
-struct user_config_helper
-{
-    // provides the same members as rocprim::scan_config and rocprim::scan_by_key_config
-    template<bool /* ByKey */>
-    struct type
-    {
-        static constexpr unsigned int                 block_size       = 256;
-        static constexpr unsigned int                 items_per_thread = 4;
-        static constexpr bool                         use_lookback     = false;
-        static constexpr ::rocprim::block_load_method block_load_method
-            = ::rocprim::block_load_method::default_method;
-        static constexpr ::rocprim::block_store_method block_store_method
-            = ::rocprim::block_store_method::default_method;
-        static constexpr ::rocprim::block_scan_algorithm block_scan_method
-            = ::rocprim::block_scan_algorithm::default_algorithm;
-        static constexpr unsigned int size_limit = ROCPRIM_GRID_SIZE_LIMIT;
-    };
-};
-
 template<unsigned int SizeLimit>
 struct size_limit_config_helper
 {
     template<bool ByKey>
     using type = std::conditional_t<
         ByKey,
-        rocprim::scan_by_key_config_v2<256,
-                                       16,
-                                       rocprim::block_load_method::block_load_transpose,
-                                       rocprim::block_store_method::block_store_transpose,
-                                       rocprim::block_scan_algorithm::using_warp_scan,
-                                       SizeLimit>,
-        rocprim::scan_config_v2<256,
-                                16,
-                                rocprim::block_load_method::block_load_transpose,
-                                rocprim::block_store_method::block_store_transpose,
-                                rocprim::block_scan_algorithm::using_warp_scan,
-                                SizeLimit>>;
+        rocprim::scan_by_key_config<256,
+                                    16,
+                                    rocprim::block_load_method::block_load_transpose,
+                                    rocprim::block_store_method::block_store_transpose,
+                                    rocprim::block_scan_algorithm::using_warp_scan,
+                                    SizeLimit>,
+        rocprim::scan_config<256,
+                             16,
+                             rocprim::block_load_method::block_load_transpose,
+                             rocprim::block_store_method::block_store_transpose,
+                             rocprim::block_scan_algorithm::using_warp_scan,
+                             SizeLimit>>;
 };
 
 // Params for tests
@@ -127,14 +108,14 @@ typedef ::testing::Types<
     DeviceScanParams<int, int, rocprim::plus<int>, false, size_limit_config_helper<524288>>,
     DeviceScanParams<int, int, rocprim::plus<int>, false, size_limit_config_helper<1048576>>,
     DeviceScanParams<int8_t, int8_t, rocprim::maximum<int8_t>>,
-    DeviceScanParams<uint8_t, uint8_t, rocprim::maximum<uint8_t>, false, user_config_helper>,
+    DeviceScanParams<uint8_t, uint8_t, rocprim::maximum<uint8_t>, false>,
     DeviceScanParams<rocprim::half, rocprim::half, rocprim::maximum<rocprim::half>>,
     DeviceScanParams<rocprim::half, float, rocprim::plus<float>>,
     DeviceScanParams<rocprim::bfloat16, rocprim::bfloat16, rocprim::maximum<rocprim::bfloat16>>,
     DeviceScanParams<rocprim::bfloat16, float, rocprim::plus<float>>,
     // Large
     DeviceScanParams<int, double, rocprim::plus<int>>,
-    DeviceScanParams<int, double, rocprim::plus<double>, false, user_config_helper>,
+    DeviceScanParams<int, double, rocprim::plus<double>, false>,
     DeviceScanParams<int, long long, rocprim::plus<long long>>,
     DeviceScanParams<unsigned int, unsigned long long, rocprim::plus<unsigned long long>>,
     DeviceScanParams<long long, long long, rocprim::maximum<long long>>,
diff --git a/test/rocprim/test_intrinsics.cpp b/test/rocprim/test_intrinsics.cpp
index 66751bb39..138705677 100644
--- a/test/rocprim/test_intrinsics.cpp
+++ b/test/rocprim/test_intrinsics.cpp
@@ -163,7 +163,7 @@ struct test_type_helper<custom_16aligned>
         for(size_t i = 0; i < result.size(); ++i)
         {
             result[i].i = static_cast<short>(random_data[i * 3]);
-            result[i].u = static_cast<unsigned int>(random_data[i * 4 + 1]);
+            result[i].u = static_cast<unsigned int>(random_data[i * 3 + 1]);
             result[i].f = random_data[i * 3 + 2];
         }
 
@@ -206,12 +206,13 @@ T bit_extract(const T value, const unsigned int bits)
     return bits == bit_size ? value : value & ((T{1} << bits) - T{1});
 }
 
-std::vector<max_lane_mask_type> active_lanes_tests()
+std::vector<max_lane_mask_type> active_lanes_tests(int device_id)
 {
     std::vector<max_lane_mask_type> tests
         = {all_lanes_active, 0x0123'4567'89AB'CDEF, 0xAAAA'AAAA'AAAA'AAAA};
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
     for(auto& test : tests)
     {
         test = bit_extract(test, hardware_warp_size);
@@ -262,7 +263,8 @@ void test_shuffle()
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
     const size_t size = hardware_warp_size;
 
     SCOPED_TRACE(testing::Message() << "with hardware_warp_size = " << hardware_warp_size);
@@ -313,7 +315,7 @@ void test_shuffle()
         auto           input = test_type_helper<T>::get_random_data(size, seed_value);
         std::vector<T> output(input.size());
 
-        for(const auto active_lanes : active_lanes_tests())
+        for(const auto active_lanes : active_lanes_tests(device_id))
         {
             SCOPED_TRACE(testing::Message()
                          << "with active_lanes = " << std::bitset<64>(active_lanes));
@@ -405,7 +407,8 @@ TYPED_TEST(RocprimIntrinsicsTests, ShuffleIndex)
     HIP_CHECK(hipSetDevice(device_id));
 
     using T = typename TestFixture::type;
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
     const size_t size = hardware_warp_size;
 
     for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
@@ -512,7 +515,9 @@ TEST(RocprimIntrinsicsTests, LaneId)
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t warps_per_block    = 4;
     const size_t block_size         = warps_per_block * hardware_warp_size;
     const size_t blocks             = 2;
@@ -571,7 +576,9 @@ TEST(RocprimIntrinsicsTests, MaskedBitCount)
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t warps_per_block    = 4;
     const size_t block_size         = warps_per_block * hardware_warp_size;
     const size_t blocks             = 2;
@@ -616,7 +623,7 @@ TEST(RocprimIntrinsicsTests, MaskedBitCount)
         {
             SCOPED_TRACE(testing::Message() << "with add = " << add);
 
-            for(const auto active_lanes : active_lanes_tests())
+            for(const auto active_lanes : active_lanes_tests(device_id))
             {
                 SCOPED_TRACE(testing::Message()
                              << "with active_lanes = " << std::bitset<64>(active_lanes));
@@ -696,7 +703,9 @@ void warp_any_all_test()
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t warps_per_block    = 4;
     const size_t block_size         = warps_per_block * hardware_warp_size;
     const size_t blocks             = 2;
@@ -729,7 +738,7 @@ void warp_any_all_test()
                                                                      all_lanes_active - 1,
                                                                      seed_value);
 
-        for(const auto active_lanes : active_lanes_tests())
+        for(const auto active_lanes : active_lanes_tests(device_id))
         {
             SCOPED_TRACE(testing::Message()
                          << "with active_lanes = " << std::bitset<64>(active_lanes));
@@ -822,7 +831,9 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute)
 
     using T = typename TestFixture::type;
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t warps_per_block    = 4;
     const size_t block_size         = warps_per_block * hardware_warp_size;
     const size_t blocks             = 2;
@@ -880,7 +891,7 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute)
 
         const auto wrap = test_utils::get_random_data<unsigned int>(size, 0, 4, seed_value);
 
-        for(const auto active_lanes : active_lanes_tests())
+        for(const auto active_lanes : active_lanes_tests(device_id))
         {
             SCOPED_TRACE(testing::Message()
                          << "with active_lanes = " << std::bitset<64>(active_lanes));
@@ -942,14 +953,15 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute)
 template<unsigned int LabelBits>
 __global__ void match_any_kernel(max_lane_mask_type* output,
                                  unsigned int*       input,
-                                 max_lane_mask_type  active_lanes)
+                                 max_lane_mask_type  active_lanes,
+                                 max_lane_mask_type  lane_predicates)
 {
     const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;
 
-    const auto         value  = input[index];
     max_lane_mask_type result = test_type_helper<max_lane_mask_type>::uninitialized();
     if(is_lane_active(active_lanes, rocprim::lane_id()))
-        result = rocprim::match_any<LabelBits>(value);
+        result = rocprim::match_any<LabelBits>(input[index],
+                                               is_lane_active(lane_predicates, rocprim::lane_id()));
     output[index] = result;
 }
 
@@ -959,7 +971,9 @@ TEST(RocprimIntrinsicsTests, MatchAny)
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t           hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t           warps_per_block    = 4;
     const size_t           block_size         = warps_per_block * hardware_warp_size;
     const size_t           blocks             = 2;
@@ -987,63 +1001,72 @@ TEST(RocprimIntrinsicsTests, MatchAny)
                                                                      1u << (label_bits + 3),
                                                                      seed_value);
 
-        for(const auto active_lanes : active_lanes_tests())
+        const auto active_lanes_for_testing = active_lanes_tests(device_id);
+        for(const auto& active_lanes : active_lanes_for_testing)
         {
-            SCOPED_TRACE(testing::Message()
-                         << "with active_lanes = " << std::bitset<64>(active_lanes));
-
-            for(size_t block = 0; block < blocks; ++block)
+            for(const auto& lane_predicates : active_lanes_for_testing)
             {
-                for(size_t warp = 0; warp < warps_per_block; ++warp)
-                {
-                    const auto base = (block * warps_per_block + warp) * hardware_warp_size;
-                    std::vector<max_lane_mask_type> histogram(1u << label_bits, 0);
+                SCOPED_TRACE(testing::Message()
+                             << "with lane_predicates = " << std::bitset<64>(lane_predicates));
+                SCOPED_TRACE(testing::Message()
+                             << "with active_lanes = " << std::bitset<64>(active_lanes));
 
-                    for(size_t lane = 0; lane < hardware_warp_size; ++lane)
+                for(size_t block = 0; block < blocks; ++block)
+                {
+                    for(size_t warp = 0; warp < warps_per_block; ++warp)
                     {
-                        if(is_lane_active(active_lanes, lane))
-                        {
-                            const auto value = bit_extract(input[base + lane], label_bits);
-                            histogram[value] |= max_lane_mask_type{1} << lane;
-                        }
-                    }
+                        const auto base = (block * warps_per_block + warp) * hardware_warp_size;
+                        std::vector<max_lane_mask_type> histogram(1u << label_bits, 0);
 
-                    for(size_t lane = 0; lane < hardware_warp_size; ++lane)
-                    {
-                        if(is_lane_active(active_lanes, lane))
+                        for(size_t lane = 0; lane < hardware_warp_size; ++lane)
                         {
-                            const auto value      = bit_extract(input[base + lane], label_bits);
-                            expected[base + lane] = histogram[value];
+                            if(is_lane_active(active_lanes, lane)
+                               && is_lane_active(lane_predicates, lane))
+                            {
+                                const auto value = bit_extract(input[base + lane], label_bits);
+                                histogram[value] |= max_lane_mask_type{1} << lane;
+                            }
                         }
-                        else
+
+                        for(size_t lane = 0; lane < hardware_warp_size; ++lane)
                         {
-                            expected[base + lane] = test_type_helper<unsigned int>::uninitialized();
+                            if(!is_lane_active(active_lanes, lane))
+                            {
+                                expected[base + lane]
+                                    = test_type_helper<unsigned int>::uninitialized();
+                                continue;
+                            }
+
+                            const auto value = bit_extract(input[base + lane], label_bits);
+                            expected[base + lane]
+                                = is_lane_active(lane_predicates, lane) ? histogram[value] : 0;
                         }
                     }
                 }
-            }
 
-            HIP_CHECK(hipMemcpy(d_input,
-                                input.data(),
-                                size * sizeof(unsigned int),
-                                hipMemcpyHostToDevice));
+                HIP_CHECK(hipMemcpy(d_input,
+                                    input.data(),
+                                    size * sizeof(unsigned int),
+                                    hipMemcpyHostToDevice));
 
-            hipLaunchKernelGGL(HIP_KERNEL_NAME(match_any_kernel<label_bits>),
-                               dim3(blocks),
-                               dim3(block_size),
-                               0,
-                               hipStreamDefault,
-                               d_output,
-                               d_input,
-                               active_lanes);
-            HIP_CHECK(hipGetLastError());
+                hipLaunchKernelGGL(HIP_KERNEL_NAME(match_any_kernel<label_bits>),
+                                   dim3(blocks),
+                                   dim3(block_size),
+                                   0,
+                                   hipStreamDefault,
+                                   d_output,
+                                   d_input,
+                                   active_lanes,
+                                   lane_predicates);
+                HIP_CHECK(hipGetLastError());
 
-            HIP_CHECK(hipMemcpy(output.data(),
-                                d_output,
-                                size * sizeof(max_lane_mask_type),
-                                hipMemcpyDeviceToHost));
+                HIP_CHECK(hipMemcpy(output.data(),
+                                    d_output,
+                                    size * sizeof(max_lane_mask_type),
+                                    hipMemcpyDeviceToHost));
 
-            test_utils::assert_eq(output, expected);
+                test_utils::assert_eq(output, expected);
+            }
         }
     }
 
@@ -1069,7 +1092,9 @@ TEST(RocprimIntrinsicsTests, Ballot)
     SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
     HIP_CHECK(hipSetDevice(device_id));
 
-    const size_t hardware_warp_size = ::rocprim::host_warp_size();
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+
     const size_t warps_per_block    = 4;
     const size_t block_size         = warps_per_block * hardware_warp_size;
     const size_t blocks             = 2;
@@ -1093,7 +1118,7 @@ TEST(RocprimIntrinsicsTests, Ballot)
 
         const auto input = test_utils::get_random_data01<unsigned int>(size, 0.5f, seed_value);
 
-        for(const auto active_lanes : active_lanes_tests())
+        for(const auto active_lanes : active_lanes_tests(device_id))
         {
             SCOPED_TRACE(testing::Message()
                          << "with active_lanes = " << std::bitset<64>(active_lanes));
@@ -1146,3 +1171,125 @@ TEST(RocprimIntrinsicsTests, Ballot)
     hipFree(d_input);
     hipFree(d_output);
 }
+
+// Per warp, ORs the lane bit of every thread for which group_elect() is true into output.
+// Bits are accumulated with atomicOr into one mask per warp, so output must start out zeroed.
+__global__ void group_elect_kernel(max_lane_mask_type* output,
+                                   max_lane_mask_type* input,
+                                   size_t              warps_per_block)
+{
+    const unsigned int warp_in_block = threadIdx.x / ::rocprim::device_warp_size();
+    const unsigned int warp_slot     = blockIdx.x * warps_per_block + warp_in_block;
+    const unsigned int thread_slot   = blockIdx.x * blockDim.x + threadIdx.x;
+    if(rocprim::group_elect(input[thread_slot]))
+    {
+        atomicOr(&output[warp_slot], max_lane_mask_type{1} << ::rocprim::lane_id());
+    }
+}
+
+TEST(RocprimIntrinsicsTests, GroupElect) // one elected lane per group, none outside groups
+{
+    const int device_id = test_common_utils::obtain_device_from_ctest();
+    SCOPED_TRACE(testing::Message() << "with device_id = " << device_id);
+    HIP_CHECK(hipSetDevice(device_id));
+
+    unsigned int hardware_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size));
+    const size_t warps_per_block = 4;
+    const size_t block_size      = warps_per_block * hardware_warp_size;
+    const size_t blocks          = 48;
+    const size_t number_of_warps = blocks * warps_per_block;
+    SCOPED_TRACE(testing::Message() << "with hardware_warp_size = " << hardware_warp_size);
+
+    max_lane_mask_type* d_input;
+    HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, blocks * block_size * sizeof(*d_input)));
+
+    max_lane_mask_type* d_output;
+    HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, number_of_warps * sizeof(*d_output)));
+
+    std::vector<max_lane_mask_type> output(number_of_warps);
+
+    for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++)
+    {
+        unsigned int seed_value
+            = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count];
+        SCOPED_TRACE(testing::Message() << "with seed = " << seed_value);
+
+        std::vector<max_lane_mask_type>              input(blocks * block_size, 0);
+        std::vector<std::vector<max_lane_mask_type>> warp_histograms(number_of_warps);
+
+        auto input_it = input.begin();
+        for(size_t block = 0; block < blocks; ++block)
+        {
+            for(size_t warp = 0; warp < warps_per_block; ++warp)
+            {
+                // Seed per (block, warp): seeding by warp alone gave every block identical labels.
+                const size_t                    warp_index = block * warps_per_block + warp;
+                const std::vector<unsigned int> group_labels
+                    = test_utils::get_random_data<unsigned int>(hardware_warp_size,
+                                                                0,
+                                                                hardware_warp_size,
+                                                                seed_value + warp_index);
+                auto& histogram = warp_histograms[warp_index];
+                histogram.assign(hardware_warp_size + 1, 0);
+                for(size_t lane = 0; lane < hardware_warp_size; ++lane)
+                {
+                    const unsigned label = group_labels[lane];
+                    histogram[label] |= max_lane_mask_type{1} << lane;
+                }
+                // A lane's input is the mask of its whole group, or 0 if it belongs to no group.
+                input_it
+                    = std::transform(group_labels.begin(),
+                                     group_labels.end(),
+                                     input_it,
+                                     [&](unsigned int label)
+                                     {
+                                         // Mark some lanes as invalid (not part of any group)
+                                         return label < hardware_warp_size ? histogram[label] : 0;
+                                     });
+            }
+        }
+
+        output.assign(number_of_warps, 0);
+
+        HIP_CHECK(hipMemcpy(d_input,
+                            input.data(),
+                            blocks * block_size * sizeof(*d_input),
+                            hipMemcpyHostToDevice));
+
+        HIP_CHECK(hipMemset(d_output, 0, number_of_warps * sizeof(*d_output)));
+
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(group_elect_kernel),
+                           dim3(blocks),
+                           dim3(block_size),
+                           0,
+                           hipStreamDefault,
+                           d_output,
+                           d_input,
+                           warps_per_block);
+        HIP_CHECK(hipGetLastError());
+
+        HIP_CHECK(hipMemcpy(output.data(),
+                            d_output,
+                            number_of_warps * sizeof(output[0]),
+                            hipMemcpyDeviceToHost));
+        // Each group must elect exactly one lane; lanes outside every group must not be elected.
+        for(size_t i = 0; i < blocks * block_size; ++i)
+        {
+            const auto group_mask  = input[i];
+            const auto warp_output = output[i / hardware_warp_size];
+            if(group_mask > 0)
+            {
+                const max_lane_mask_type group_elect = group_mask & warp_output;
+                ASSERT_TRUE(rocprim::detail::is_power_of_two(group_elect));
+            }
+            else
+            {
+                ASSERT_EQ(warp_output & (max_lane_mask_type{1} << (i % hardware_warp_size)), 0);
+            }
+        }
+    }
+
+    HIP_CHECK(hipFree(d_input));
+    HIP_CHECK(hipFree(d_output));
+}
diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp
index 522c309e3..573b5cbb9 100644
--- a/test/rocprim/test_utils.hpp
+++ b/test/rocprim/test_utils.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -21,11 +21,12 @@
 #ifndef TEST_TEST_UTILS_HPP_
 #define TEST_TEST_UTILS_HPP_
 
-#include <rocprim/types.hpp>
+#include <rocprim/detail/match_result_type.hpp>
+#include <rocprim/device/config_types.hpp>
 #include <rocprim/functional.hpp>
 #include <rocprim/intrinsics.hpp>
 #include <rocprim/type_traits.hpp>
-#include <rocprim/detail/match_result_type.hpp>
+#include <rocprim/types.hpp>
 
 // Identity iterator
 #include "identity_iterator.hpp"
@@ -432,16 +433,16 @@ void iota(ForwardIt first, ForwardIt last, T value)
     }
 }
 
-#define SKIP_IF_UNSUPPORTED_WARP_SIZE(test_warp_size) { \
-    const auto host_warp_size = ::rocprim::host_warp_size(); \
-    if (host_warp_size < (test_warp_size)) \
-    { \
-        GTEST_SKIP() << "Cannot run test of warp size " \
-            << (test_warp_size) \
-            << " on a device with warp size " \
-            << host_warp_size; \
-    } \
-}
+#define SKIP_IF_UNSUPPORTED_WARP_SIZE(test_warp_size, device_id)                \
+    { /* Skip the test when the device warp size is too small. */               \
+        unsigned int host_warp_size = 0;                                        \
+        HIP_CHECK(::rocprim::host_warp_size((device_id), host_warp_size));      \
+        if(host_warp_size < (test_warp_size))                                   \
+        {                                                                       \
+            GTEST_SKIP() << "Cannot run test of warp size " << (test_warp_size) \
+                         << " on a device with warp size " << host_warp_size;   \
+        }                                                                       \
+    }
 
 template<unsigned int LogicalWarpSize>
 struct DeviceSelectWarpSize
diff --git a/test/rocprim/test_utils_assertions.hpp b/test/rocprim/test_utils_assertions.hpp
index cb5e713c3..3e9ff4e93 100644
--- a/test/rocprim/test_utils_assertions.hpp
+++ b/test/rocprim/test_utils_assertions.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -240,6 +240,81 @@ void assert_bit_eq(const std::vector<T>& result, const std::vector<T>& expected)
     }
 }
 
+#if defined(__GNUC__) || defined(__clang__)
+inline void assert_bit_eq(const std::vector<__int128_t>& result,
+                          const std::vector<__int128_t>& expected)
+{
+    ASSERT_EQ(result.size(), expected.size());
+    // Decimal formatter for __int128_t, used to print mismatching values below.
+    auto to_string = [](__int128_t value)
+    {
+        static const char* charmap = "0123456789";
+        // Take the magnitude in unsigned arithmetic: `-value` overflows for the minimum value.
+        const __uint128_t bits   = static_cast<__uint128_t>(value);
+        __uint128_t       helper = (value < 0) ? 0 - bits : bits;
+        std::string result;
+        result.reserve(41); // <= 39 decimal digits plus an optional sign
+        do
+        {
+            result += charmap[helper % 10];
+            helper /= 10;
+        }
+        while(helper);
+        if(value < 0)
+        {
+            result += "-";
+        }
+        std::reverse(result.begin(), result.end());
+        return result;
+    };
+
+    for(size_t i = 0; i < result.size(); i++)
+    {
+        if(!bit_equal(result[i], expected[i]))
+        {
+            FAIL() << "Expected strict/bitwise equality of these values: " << std::endl
+                   << "     result[i]: " << to_string(result[i]) << std::endl
+                   << "     expected[i]: " << to_string(expected[i]) << std::endl
+                   << "where index = " << i;
+        }
+    }
+}
+
+inline void assert_bit_eq(const std::vector<__uint128_t>& result,
+                          const std::vector<__uint128_t>& expected)
+{
+    ASSERT_EQ(result.size(), expected.size());
+    // Decimal formatter for __uint128_t, used to print mismatching values below.
+    auto to_string = [](__uint128_t value)
+    {
+        static const char* charmap = "0123456789";
+
+        std::string result;
+        result.reserve(40); // a __uint128_t has at most 39 decimal digits (uint64_t has 20)
+        __uint128_t helper = value;
+
+        do
+        {
+            result += charmap[helper % 10];
+            helper /= 10;
+        }
+        while(helper);
+        std::reverse(result.begin(), result.end());
+        return result;
+    };
+
+    for(size_t i = 0; i < result.size(); i++)
+    {
+        if(!bit_equal(result[i], expected[i]))
+        {
+            FAIL() << "Expected strict/bitwise equality of these values: " << std::endl
+                   << "     result[i]: " << to_string(result[i]) << std::endl
+                   << "     expected[i]: " << to_string(expected[i]) << std::endl
+                   << "where index = " << i;
+        }
+    }
+}
+#endif
 }
 
 #endif //ROCPRIM_TEST_UTILS_ASSERTIONS_HPP
diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp
index 4c8881f12..74ea08440 100644
--- a/test/rocprim/test_utils_data_generation.hpp
+++ b/test/rocprim/test_utils_data_generation.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -23,10 +23,16 @@
 
 // Std::memcpy and std::memcmp
 #include <cstring>
+#include <vector>
 
-#include "test_utils_half.hpp"
+#include <rocprim/test_seed.hpp>
+#include <rocprim/type_traits.hpp>
+#include <rocprim/types.hpp>
+
+#include "common_test_header.hpp"
 #include "test_utils_bfloat16.hpp"
 #include "test_utils_custom_test_types.hpp"
+#include "test_utils_half.hpp"
 
 namespace test_utils {
 
@@ -113,6 +119,23 @@ template<> class numeric_limits<test_utils::bfloat16> : public std::numeric_limi
 };
 // End of extended numeric_limits
 
+// Converts rocprim::half/bfloat16 to their native_* counterparts; any other type is returned as-is.
+inline rocprim::native_half convert_to_native(const rocprim::half& value)
+{
+    return rocprim::native_half(value);
+}
+
+inline rocprim::native_bfloat16 convert_to_native(const rocprim::bfloat16& value)
+{
+    return rocprim::native_bfloat16(value);
+}
+
+template<class T>
+inline auto convert_to_native(const T& value)
+{
+    return value;
+}
+
 // Helper class to generate a vector of special values for any type
 template<class T>
 struct special_values {
@@ -159,6 +182,90 @@ void add_special_values(std::vector<T>& source, seed_type seed_value)
     }
 }
 
+template<class T, class U, class V>
+inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) ->
+    typename std::enable_if<std::is_same<T, __int128_t>::value, std::vector<T>>::type
+{ // Overload enabled only for T == __int128_t; returns `size` values drawn from [min, max].
+    engine_type gen{seed_value}; // random engine seeded by the caller
+    using dis_type = typename std::conditional<
+        is_valid_for_int_distribution<T>::value,
+        T, // draw T directly when allowed, otherwise int/unsigned int by signedness of T
+        typename std::conditional<std::is_signed<T>::value, int, unsigned int>::type>::type;
+    std::uniform_int_distribution<dis_type> distribution(static_cast<dis_type>(min),
+                                                         static_cast<dis_type>(max));
+    std::vector<T>                          data(size); // value-initialized to 0
+    size_t                                  segment_size = size / random_data_generation_segments;
+    if(segment_size != 0) // at least one element per segment: fill segment by segment
+    {
+        for(uint32_t segment_index = 0; segment_index < random_data_generation_segments;
+            segment_index++)
+        {
+            if(segment_index % random_data_generation_repeat_strides == 0)
+            { // this segment is filled with a single repeated draw
+                T repeated_value = static_cast<T>(distribution(gen));
+                std::fill(data.begin() + segment_size * segment_index,
+                          data.begin() + segment_size * (segment_index + 1),
+                          repeated_value);
+            }
+            else
+            { // every element of this segment gets its own draw
+                std::generate(data.begin() + segment_size * segment_index,
+                              data.begin() + segment_size * (segment_index + 1),
+                              [&]() { return static_cast<T>(distribution(gen)); });
+            }
+        }
+    } // elements past segment_size * random_data_generation_segments keep their initial 0
+    else // fewer elements than segments: draw each element independently
+    {
+        std::generate(data.begin(),
+                      data.end(),
+                      [&]() { return static_cast<T>(distribution(gen)); });
+    }
+    return data;
+}
+
+template<class T, class U, class V>
+inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) ->
+    typename std::enable_if<std::is_same<T, __uint128_t>::value, std::vector<T>>::type
+{ // Overload enabled only for T == __uint128_t; returns `size` values drawn from [min, max].
+    engine_type gen{seed_value}; // random engine seeded by the caller
+    using dis_type = typename std::conditional<
+        is_valid_for_int_distribution<T>::value,
+        T, // draw T directly when allowed, otherwise int/unsigned int by signedness of T
+        typename std::conditional<std::is_signed<T>::value, int, unsigned int>::type>::type;
+    std::uniform_int_distribution<dis_type> distribution(static_cast<dis_type>(min),
+                                                         static_cast<dis_type>(max));
+    std::vector<T>                          data(size); // value-initialized to 0
+    size_t                                  segment_size = size / random_data_generation_segments;
+    if(segment_size != 0) // at least one element per segment: fill segment by segment
+    {
+        for(uint32_t segment_index = 0; segment_index < random_data_generation_segments;
+            segment_index++)
+        {
+            if(segment_index % random_data_generation_repeat_strides == 0)
+            { // this segment is filled with a single repeated draw
+                T repeated_value = static_cast<T>(distribution(gen));
+                std::fill(data.begin() + segment_size * segment_index,
+                          data.begin() + segment_size * (segment_index + 1),
+                          repeated_value);
+            }
+            else
+            { // every element of this segment gets its own draw
+                std::generate(data.begin() + segment_size * segment_index,
+                              data.begin() + segment_size * (segment_index + 1),
+                              [&]() { return static_cast<T>(distribution(gen)); });
+            }
+        }
+    } // elements past segment_size * random_data_generation_segments keep their initial 0
+    else // fewer elements than segments: draw each element independently
+    {
+        std::generate(data.begin(),
+                      data.end(),
+                      [&]() { return static_cast<T>(distribution(gen)); });
+    }
+    return data;
+}
+
 template<class T, class U, class V>
 inline auto get_random_data(size_t size, U min, V max, seed_type seed_value)
     -> typename std::enable_if<rocprim::is_integral<T>::value, std::vector<T>>::type
@@ -431,6 +538,33 @@ std::vector<size_t> get_large_sizes(T seed_value)
     std::sort(sizes.begin(), sizes.end());
     return sizes;
 }
+
+/// \brief Computes the smallest multiple of \p divisor that is not less than \p ref.
+/// \param ref Number to be rounded up.
+/// \param divisor Number whose multiple is sought; if 0, \p ref is returned unchanged.
+inline size_t closest_greater_multiple(const size_t ref, const size_t divisor)
+{
+    if(!divisor)
+    {
+        return ref;
+    }
+    const size_t remainder = ref % divisor;
+    size_t       distance  = remainder ? divisor - remainder : 0;
+    return ref + distance;
+}
+
+template<class T>
+std::vector<size_t> get_block_size_multiples(T seed_value, const unsigned int block_size)
+{ // Returns get_sizes(seed_value) rounded up to multiples of block_size, without duplicates.
+    std::vector<size_t> sizes = get_sizes(seed_value);
+    std::transform(sizes.begin(),
+                   sizes.end(),
+                   sizes.begin(),
+                   [block_size](size_t size)
+                   { return test_utils::closest_greater_multiple(size, block_size); });
+    std::set<size_t> unique_sizes(sizes.begin(), sizes.end()); // dedupes; iterates ascending
+    return std::vector<size_t>(unique_sizes.begin(), unique_sizes.end());
+}
 }
 
 #endif //ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP
diff --git a/test/rocprim/test_utils_sort_comparator.hpp b/test/rocprim/test_utils_sort_comparator.hpp
index 71eed3cc7..fa7be26ce 100644
--- a/test/rocprim/test_utils_sort_comparator.hpp
+++ b/test/rocprim/test_utils_sort_comparator.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -65,6 +65,46 @@ struct key_comparator<Key,
     }
 };
 
+template<class Key, bool Descending, unsigned int StartBit, unsigned int EndBit>
+struct key_comparator<Key,
+                      Descending,
+                      StartBit,
+                      EndBit,
+                      typename std::enable_if<std::is_same<Key, __int128_t>::value>::type>
+{ // Specialization for __int128_t keys: orders keys by the bits selected by radix_mask.
+    static constexpr Key radix_mask_upper
+        = EndBit == 8 * sizeof(Key) ? ~Key(0) : (Key(1) << EndBit) - 1; // bits below EndBit
+    static constexpr Key radix_mask_bottom = (Key(1) << StartBit) - 1; // bits below StartBit
+    static constexpr Key radix_mask        = radix_mask_upper ^ radix_mask_bottom;
+    // For StartBit <= EndBit, radix_mask keeps bits [StartBit, EndBit); other bits are ignored.
+    bool operator()(const Key& lhs, const Key& rhs) const
+    {
+        Key l = lhs & radix_mask;
+        Key r = rhs & radix_mask;
+        return Descending ? (r < l) : (l < r); // Descending swaps the operands of operator<
+    }
+};
+
+template<class Key, bool Descending, unsigned int StartBit, unsigned int EndBit>
+struct key_comparator<Key,
+                      Descending,
+                      StartBit,
+                      EndBit,
+                      typename std::enable_if<std::is_same<Key, __uint128_t>::value>::type>
+{ // Specialization for __uint128_t keys: orders keys by the bits selected by radix_mask.
+    static constexpr Key radix_mask_upper
+        = EndBit == 8 * sizeof(Key) ? ~Key(0) : (Key(1) << EndBit) - 1; // bits below EndBit
+    static constexpr Key radix_mask_bottom = (Key(1) << StartBit) - 1; // bits below StartBit
+    static constexpr Key radix_mask        = radix_mask_upper ^ radix_mask_bottom;
+    // For StartBit <= EndBit, radix_mask keeps bits [StartBit, EndBit); other bits are ignored.
+    bool operator()(const Key& lhs, const Key& rhs) const
+    {
+        Key l = lhs & radix_mask;
+        Key r = rhs & radix_mask;
+        return Descending ? (r < l) : (l < r); // Descending swaps the operands of operator<
+    }
+};
+
 template<class Key, bool Descending, unsigned int StartBit, unsigned int EndBit>
 struct key_comparator<Key,
                       Descending,
diff --git a/test/rocprim/test_utils_types.hpp b/test/rocprim/test_utils_types.hpp
index 0d5c9b7e0..1f793cbda 100644
--- a/test/rocprim/test_utils_types.hpp
+++ b/test/rocprim/test_utils_types.hpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -156,6 +156,13 @@ typedef ::testing::Types<
     block_param_type(int8_t, float)
 > BlockParamsIntegral;
 
+typedef ::testing::Types<block_param_type(int, test_utils::custom_test_type<int>),
+                         block_param_type(uint8_t, short),
+                         block_param_type(int8_t, float),
+                         block_param_type(__uint128_t, short),
+                         block_param_type(__int128_t, float)>
+    BlockParamsIntegralExtended; // integral block params including 128-bit integer types
+
 typedef ::testing::Types<
     block_param_type(float, long),
     block_param_type(double, test_utils::custom_test_type<double>),
diff --git a/test/rocprim/test_warp_exchange.cpp b/test/rocprim/test_warp_exchange.cpp
index 802dc362f..ba985a904 100644
--- a/test/rocprim/test_warp_exchange.cpp
+++ b/test/rocprim/test_warp_exchange.cpp
@@ -221,7 +221,8 @@ TYPED_TEST(WarpExchangeTest, WarpExchange)
     constexpr unsigned int block_size = warp_size;
     constexpr unsigned int items_count = items_per_thread * block_size;
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));
@@ -336,7 +337,8 @@ TYPED_TEST(WarpExchangeScatterTest, WarpExchangeScatter)
     constexpr unsigned int items_count = items_per_thread * block_size;
     using OffsetT = unsigned short;
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));
diff --git a/test/rocprim/test_warp_load.cpp b/test/rocprim/test_warp_load.cpp
index cc1592664..e1a14cfa0 100644
--- a/test/rocprim/test_warp_load.cpp
+++ b/test/rocprim/test_warp_load.cpp
@@ -185,7 +185,8 @@ TYPED_TEST(WarpLoadTest, WarpLoad)
     constexpr unsigned int block_size = 1024;
     constexpr unsigned int items_count = items_per_thread * block_size;
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));
@@ -238,7 +239,8 @@ TYPED_TEST(WarpLoadTest, WarpLoadGuarded)
     constexpr unsigned int valid_items = warp_size / 4;
     constexpr T oob_default = std::numeric_limits<T>::max();
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));
diff --git a/test/rocprim/test_warp_reduce.hpp b/test/rocprim/test_warp_reduce.hpp
index c3c4371f6..02da74dd3 100644
--- a/test/rocprim/test_warp_reduce.hpp
+++ b/test/rocprim/test_warp_reduce.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +56,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceSum)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -175,7 +176,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, AllReduceSum)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -298,7 +300,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceSumValid)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -419,7 +422,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, AllReduceSumValid)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -542,7 +546,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceCustomStruct)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -674,7 +679,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, HeadSegmentedReduceSum)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
@@ -829,7 +835,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, TailSegmentedReduceSum)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     static constexpr unsigned int grid_size = 4;
diff --git a/test/rocprim/test_warp_scan.hpp b/test/rocprim/test_warp_scan.hpp
index 193a69459..4573b22f8 100644
--- a/test/rocprim/test_warp_scan.hpp
+++ b/test/rocprim/test_warp_scan.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -56,7 +56,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScan)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -178,7 +179,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScanReduce)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -322,7 +324,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ExclusiveScan)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -447,7 +450,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ExclusiveReduceScan)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -599,7 +603,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, Scan)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -757,7 +762,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ScanReduce)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
@@ -936,7 +942,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScanCustomType)
             ? rocprim::max<size_t>(ws64, logical_warp_size * 4)
             : rocprim::max<size_t>((ws64/logical_warp_size), 1) * logical_warp_size;
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
 
     const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64;
     const unsigned int grid_size = 4;
diff --git a/test/rocprim/test_warp_sort.hpp b/test/rocprim/test_warp_sort.hpp
index a1835c830..b1da8d1b5 100644
--- a/test/rocprim/test_warp_sort.hpp
+++ b/test/rocprim/test_warp_sort.hpp
@@ -1,6 +1,6 @@
 // MIT License
 //
-// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
@@ -41,7 +41,8 @@ typed_test_def(RocprimWarpSortShuffleBasedTests, name_suffix, Sort)
     static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32);
     static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64);
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
     static constexpr size_t block_size = std::max<size_t>(256U, logical_warp_size * 4);
 
     static constexpr unsigned int grid_size = 4;
@@ -139,7 +140,8 @@ typed_test_def(RocprimWarpSortShuffleBasedTests, name_suffix, SortKeyInt)
     static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32);
     static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64);
 
-    const unsigned int current_device_warp_size = rocprim::host_warp_size();
+    unsigned int current_device_warp_size;
+    HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size));
     static constexpr size_t block_size = std::max<size_t>(256U, logical_warp_size * 4);
 
     static constexpr unsigned int grid_size = 4;
diff --git a/test/rocprim/test_warp_store.cpp b/test/rocprim/test_warp_store.cpp
index 872d96328..2d6a0430a 100644
--- a/test/rocprim/test_warp_store.cpp
+++ b/test/rocprim/test_warp_store.cpp
@@ -175,7 +175,8 @@ TYPED_TEST(WarpStoreTest, WarpLoad)
     constexpr unsigned int block_size = 1024;
     constexpr unsigned int items_count = items_per_thread * block_size;
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));
@@ -227,7 +228,8 @@ TYPED_TEST(WarpStoreTest, WarpStoreGuarded)
     constexpr unsigned items_count = items_per_thread * block_size;
     constexpr int valid_items = warp_size / 4;
 
-    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size);
+    int device_id = test_common_utils::obtain_device_from_ctest();
+    SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id);
 
     std::vector<T> input(items_count);
     std::iota(input.begin(), input.end(), static_cast<T>(0));