diff --git a/.clang-format b/.clang-format index 0395753c8..605a54fdc 100644 --- a/.clang-format +++ b/.clang-format @@ -58,8 +58,6 @@ BraceWrapping: AfterNamespace: true AfterStruct: true AfterUnion: true - BeforeCatch: true - BeforeElse: true AfterExternBlock: false BeforeCatch: true BeforeElse: true diff --git a/.gitignore b/.gitignore index 3bee42a2a..42ae29137 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,14 @@ ### Build dirs ### -build/ +build*/ + +### clangd. ### +/.cache + +### Docs dirs ### +doc/html/ +doc/xml/ +doc/latex/ +doc/*.tag # Created by https://www.gitignore.io/api/c++,cmake diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 979c4af77..0229df12c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -230,9 +230,7 @@ build:package: build:benchmark: stage: build - needs: - - job: "autotune:generate-config" - optional: true + needs: [] tags: - rocm-build extends: @@ -270,7 +268,7 @@ autotune:build: extends: - .cmake-minimum - .gpus:rocm-gpus - - .rules:manual + - .rules:benchmark variables: BENCHMARK_TARGETS: benchmark_config_tuning script: @@ -282,6 +280,7 @@ autotune:build: -S $CI_PROJECT_DIR -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" + -D CMAKE_CXX_FLAGS="-Wno-#pragma-messages" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF @@ -472,11 +471,7 @@ autotune:execute-tuning: # Exclude benchmark that is known to fail on gfx906 # On ROCm 5.7 or later, check if this can be removed - the presumption is that the failure is caused by a compiler issue. 
- > - if [[ "${GPU_TARGET}" == "gfx906" ]] && [[ "${AUTOTUNE_ALGORITHM_REGEX}" == "" ]]; then - export AUTOTUNE_ALGORITHM_REGEX="-\{\"lvl\":\"device\",\"algo\":\"radix_sort_onesweep\",\"key_type\":\"short\",\"value_type\":\"short\",\"cfg\":\{\"histogram\":\{\"bs\":1024,\"ipt\":22},\"sort\":\{\"bs\":1024,\"ipt\":22},\"bits_per_place\":5,\"algorithm\":\"block_radix_rank_algorithm::match\"}}" - fi - - 'printf "CI Variables used in benchmarks:\nAUTOTUNE_RESULT_DIR: %s\nAUTOTUNE_FILENAME_REGEX: %s\nAUTOTUNE_ALGORITHM_REGEX: %s \nAUTOTUNE_SIZE: %s \nAUTOTUNE_TRIALS: %s\n" "$AUTOTUNE_RESULT_DIR" "$AUTOTUNE_FILENAME_REGEX" "$AUTOTUNE_ALGORITHM_REGEX" "$AUTOTUNE_SIZE" "$AUTOTUNE_TRIALS"' - - cd "${CI_PROJECT_DIR}" + cd "${CI_PROJECT_DIR}" - mkdir -p "${AUTOTUNE_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py diff --git a/CHANGELOG.md b/CHANGELOG.md index ab972ba62..693593bc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,19 +2,39 @@ Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/en/latest/](https://rocprim.readthedocs.io/en/latest/) +## [Unreleased rocPRIM-3.0.0 for ROCm 6.1.0] +### Added + - Added new primitive: `block_run_length_decode`. +### Changed +- Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`. +- Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`, `reduce_by_key_config_v2` to `reduce_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`. +- Removed support for custom config types for device algorithms. +- `host_warp_size()` was moved into `rocprim/device/config_types.hpp`, and now uses either a `device_id` or a `stream` parameter to query the proper device and a `device_id` out parameter. The return type is `hipError_t`. +- Added support for __int128_t in `device_radix_sort` and `block_radix_sort`. 
+### Fixed +- Fixed build issues with `rmake.py` on Windows when using VS 2017 15.8 or later due to a breaking fix with extended aligned storage. + ## [Unreleased rocPRIM-2.13.1 for ROCm 5.7.0] ### Added - `block_sort::sort()` overload for keys and values with a dynamic size, for all block sort algorithms. Additionally, all `block_sort::sort()` overloads with a dynamic size are now supported for `block_sort_algorithm::merge_sort` and `block_sort_algorithm::bitonic_sort`. - New two-way partition primitive `partition_two_way` which can write to two separate iterators. +- Added config tuning and dynamic dispatch to `device_adjacent_difference` algorithm +- New `rocprim::group_elect` warp intrinsic, which chooses one lane from the lanes enabled by a mask. ### Changed - Deprecated configuration `radix_sort_config` for device-level radix sort as it no longer matches the algorithm's parameters. New configuration `radix_sort_config_v2` is preferred instead. - Removed erroneous implementation of device-level `inclusive_scan` and `exclusive_scan`. The prior default implementation using lookback-scan now is the only available implementation. - The benchmark metric indicating the bytes processed for `exclusive_scan_by_key` and `inclusive_scan_by_key` has been changed to incorporate the key type. Furthermore, the benchmark log has been changed such that these algorithms are reported as `scan` and `scan_by_key` instead of `scan_exclusive` and `scan_inclusive`. - Deprecated configurations `scan_config` and `scan_by_key_config` for device-level scans, as they no longer match the algorithm's parameters. New configurations `scan_config_v2` and `scan_by_key_config_v2` are preferred instead. - Improved the performance of `partition`. +- `merge_sort_block_sort` will always use stable merge sort as it is faster than the fallback implementation. +- The `rocprim::match_any` interface has a new parameter, `valid` to enable/disable lanes. 
The default value is true, so it doesn't change the previous behaviour. ### Fixed - Fixed build issue caused by missing header in `thread/thread_search.hpp`. - Fixed `rocprim::MatchAny` for devices with 64-bit warp size. The function `rocprim::MatchAny` is deprecated and `rocprim::match_any` is preferred instead. +- Fixed `device_adjacent_difference` using more shared memory than required. +- Fixed a compilation error when `ROCPRIM_DISABLE_DPP` is defined. +- rocPRIM should be more robust for detecting GPU architecture features. Explicitly listing each architecture is no longer required by developers, fixing compilation failures when + targeting devices not known by rocPRIM. ## [rocPRIM-2.13.0 for ROCm 5.5.0] ### Added diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index c65de966c..8087b43ff 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -123,6 +123,7 @@ add_rocprim_benchmark(benchmark_block_histogram.cpp) add_rocprim_benchmark(benchmark_block_radix_sort.cpp) add_rocprim_benchmark(benchmark_block_radix_rank.cpp) add_rocprim_benchmark(benchmark_block_reduce.cpp) +add_rocprim_benchmark(benchmark_block_run_length_decode.cpp) add_rocprim_benchmark(benchmark_block_scan.cpp) add_rocprim_benchmark(benchmark_block_sort.cpp) add_rocprim_benchmark(benchmark_config_dispatch.cpp) diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake index d1fcd2490..510c222ad 100644 --- a/benchmark/ConfigAutotuneSettings.cmake +++ b/benchmark/ConfigAutotuneSettings.cmake @@ -29,10 +29,10 @@ set(LIMITED_TUNING_TYPES "int64_t int short int8_t") function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix) if(file STREQUAL "benchmark_device_adjacent_difference") - set(list_across_names "DataType;Left;InPlace;BlockSize;ItemsPerThread" PARENT_SCOPE) + set(list_across_names "DataType;Left;InPlace;BlockSize" PARENT_SCOPE) set(list_across "${TUNING_TYPES};\ -true false;true false;64 
128;1 2 4 8 16" PARENT_SCOPE) - set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE) +true;false true;32 64 128 256 512 1024" PARENT_SCOPE) + set(output_pattern_suffix "@DataType@_@Left@_@InPlace@_@BlockSize@" PARENT_SCOPE) elseif(file STREQUAL "benchmark_device_histogram") set(list_across_names "DataType;BlockSize" PARENT_SCOPE) set(list_across "${TUNING_TYPES};64 128 256" PARENT_SCOPE) diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp new file mode 100644 index 000000000..04e1f0428 --- /dev/null +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -0,0 +1,242 @@ +// MIT License +// +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#include "benchmark/benchmark.h" +#include "benchmark_utils.hpp" +#include "cmdparser.hpp" + +#include "rocprim/block/block_load.hpp" +#include "rocprim/block/block_run_length_decode.hpp" +#include "rocprim/block/block_store.hpp" + +#include +#include + +#ifndef DEFAULT_N +const size_t DEFAULT_N = 1024 * 1024 * 32; +#endif + +template +__global__ + __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, + const OffsetT* d_run_offsets, + ItemT* d_decoded_items, + bool enable_store = false) +{ + using BlockRunLengthDecodeT + = rocprim::block_run_length_decode; + + ItemT run_items[RunsPerThread]; + OffsetT run_offsets[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); + rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets); + + ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage; + BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); + + const OffsetT total_decoded_size + = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] + - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; + +#pragma nounroll + for(unsigned i = 0; i < Trials; ++i) + { + OffsetT decoded_window_offset = 0; + while(decoded_window_offset < total_decoded_size) + { + ItemT decoded_items[DecodedItemsPerThread]; + block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset); + + if(enable_store) + { + rocprim::block_store_direct_blocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items); + } + + decoded_window_offset += BlockSize * DecodedItemsPerThread; + } + } +} + +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto runs_per_block = BlockSize * RunsPerThread; + const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); + const auto num_runs 
+ = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); + + std::vector run_items(num_runs); + std::vector run_offsets(num_runs + 1); + + std::default_random_engine prng(std::random_device{}()); + using ItemDistribution = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + ItemDistribution run_item_dist(0, 100); + std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); + + for(size_t i = 0; i < num_runs; ++i) + { + run_items[i] = run_item_dist(prng); + } + for(size_t i = 1; i < num_runs + 1; ++i) + { + const OffsetT next_run_length = run_length_dist(prng); + run_offsets[i] = run_offsets[i - 1] + next_run_length; + } + const OffsetT output_length = run_offsets.back(); + + ItemT* d_run_items{}; + HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + OffsetT* d_run_offsets{}; + HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); + HIP_CHECK(hipMemcpy(d_run_offsets, + run_offsets.data(), + run_offsets.size() * sizeof(OffsetT), + hipMemcpyHostToDevice)); + + ItemT* d_output{}; + HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), + dim3(num_runs / runs_per_block), + dim3(BlockSize), + 0, + stream, + d_run_items, + d_run_offsets, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); + state.SetItemsProcessed(state.iterations() * output_length * Trials); + + 
HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_offsets)); + HIP_CHECK(hipFree(d_output)); +} + +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark("block_run_length_decode", \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_run_length_decode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), + + CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + 
b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in b/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in index 0892d2660..03c316d5b 100644 --- a/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in +++ b/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -26,9 +26,11 @@ #include "benchmark_device_adjacent_difference.parallel.hpp" namespace { - auto benchmarks = config_autotune_register::create>>(); + @InPlace@>::create); + } diff --git a/benchmark/benchmark_device_adjacent_difference.parallel.hpp b/benchmark/benchmark_device_adjacent_difference.parallel.hpp index b74de4050..51f8cdfda 100644 --- a/benchmark/benchmark_device_adjacent_difference.parallel.hpp +++ b/benchmark/benchmark_device_adjacent_difference.parallel.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -39,21 +39,36 @@ #include "benchmark_utils.hpp" -template> +template +std::string config_name() +{ + //const rocprim::adjacent_difference_config = Config(); + auto config = Config(); + return "{bs:" + std::to_string(config.block_size) + + ",ipt:" + std::to_string(config.items_per_thread) + "}"; +} + +template<> +inline std::string config_name() +{ + return "default_config"; +} + +template struct device_adjacent_difference_benchmark : public config_autotune_interface { + std::string name() const override { + using namespace std::string_literals; - return bench_naming::format_name( - "{lvl:device,algo:adjacent_difference" + (left ? ""s : "_right"s) - + (in_place ? "_inplace"s : ""s) + ",key_type:" + std::string(Traits::name()) - + ",cfg:{bs:" + std::to_string(Config::block_size) - + ",ipt:" + std::to_string(Config::items_per_thread) + "}}"); + return bench_naming::format_name("{lvl:device,algo:adjacent_difference" + + (Left ? ""s : "_right"s) + (InPlace ? "_inplace"s : ""s) + + ",value_type:" + std::string(Traits::name()) + + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; @@ -84,11 +99,11 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface const OutputIt output, Args&&... 
args) const { - return ::rocprim::adjacent_difference_right(temporary_storage, - storage_size, - input, - output, - std::forward(args)...); + return ::rocprim::adjacent_difference_right(temporary_storage, + storage_size, + input, + output, + std::forward(args)...); } template @@ -140,13 +155,13 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); - if(!in_place) + if(!InPlace) { HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); } - static constexpr auto left_tag = rocprim::detail::bool_constant{}; - static constexpr auto in_place_tag = rocprim::detail::bool_constant{}; + static constexpr auto left_tag = rocprim::detail::bool_constant{}; + static constexpr auto in_place_tag = rocprim::detail::bool_constant{}; // Allocate temporary storage std::size_t temp_storage_size; @@ -208,7 +223,7 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); - if(!in_place) + if(!InPlace) { hipFree(d_output); } @@ -216,4 +231,34 @@ struct device_adjacent_difference_benchmark : public config_autotune_interface } }; +template +struct device_adjacent_difference_benchmark_generator +{ + + template + struct create_ipt + { + using generated_config + = rocprim::adjacent_difference_config; + + void operator()(std::vector>& storage) + { + storage.emplace_back( + std::make_unique< + device_adjacent_difference_benchmark>()); + } + }; + + static void create(std::vector>& storage) + { + static constexpr unsigned int min_items_per_thread = 1; + static constexpr unsigned int max_items_per_thread_arg + = TUNING_SHARED_MEMORY_MAX / (BlockSize * sizeof(T) * 2 + sizeof(T)); + static constexpr unsigned int max_items_per_thread + = rocprim::Log2::VALUE - 1; + static_for_each, + create_ipt>(storage); + } +}; + #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ diff --git 
a/benchmark/benchmark_device_binary_search.cpp b/benchmark/benchmark_device_binary_search.cpp index 242619b8c..77353b6a1 100644 --- a/benchmark/benchmark_device_binary_search.cpp +++ b/benchmark/benchmark_device_binary_search.cpp @@ -40,6 +40,7 @@ #include #include "benchmark_device_binary_search.parallel.hpp" +#include "rocprim/device/config_types.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; @@ -96,32 +97,33 @@ void run_benchmark(benchmark::State& state, void * d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + auto dispatch_helper = dispatch_binary_search_helper(); + HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { - HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); } HIP_CHECK(hipDeviceSynchronize()); @@ -137,16 +139,16 @@ void run_benchmark(benchmark::State& state, for(size_t i = 0; i < batch_size; i++) { - HIP_CHECK(dispatch_binary_search(AlgorithmSelectorTag{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + 
HIP_CHECK(dispatch_helper.dispatch_binary_search(AlgorithmSelectorTag{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); } // Record stop event and wait until it completes diff --git a/benchmark/benchmark_device_binary_search.parallel.hpp b/benchmark/benchmark_device_binary_search.parallel.hpp index fdc3d5d27..cb9ff3c79 100644 --- a/benchmark/benchmark_device_binary_search.parallel.hpp +++ b/benchmark/benchmark_device_binary_search.parallel.hpp @@ -28,6 +28,8 @@ #include #include "benchmark_utils.hpp" +#include "rocprim/device/config_types.hpp" +#include "rocprim/device/detail/device_config_helper.hpp" #include #include #include @@ -56,23 +58,52 @@ struct upper_bound_subalgorithm } }; -template -hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args) +template +struct dispatch_binary_search_helper { - return rocprim::binary_search(std::forward(args)...); -} + template + hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args) + { + using config = rocprim::binary_search_config; + return rocprim::binary_search(std::forward(args)...); + } -template -hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args) -{ - return rocprim::upper_bound(std::forward(args)...); -} + template + hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args) + { + using config = rocprim::upper_bound_config; + return rocprim::upper_bound(std::forward(args)...); + } + + template + hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args) + { + using config = rocprim::lower_bound_config; + return rocprim::lower_bound(std::forward(args)...); + } +}; -template -hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... 
args) +template<> +struct dispatch_binary_search_helper { - return rocprim::lower_bound(std::forward(args)...); -} + template + hipError_t dispatch_binary_search(binary_search_subalgorithm, Args&&... args) + { + return rocprim::binary_search(std::forward(args)...); + } + + template + hipError_t dispatch_binary_search(upper_bound_subalgorithm, Args&&... args) + { + return rocprim::upper_bound(std::forward(args)...); + } + + template + hipError_t dispatch_binary_search(lower_bound_subalgorithm, Args&&... args) + { + return rocprim::lower_bound(std::forward(args)...); + } +}; template struct device_binary_search_benchmark : public config_autotune_interface @@ -116,30 +147,31 @@ struct device_binary_search_benchmark : public config_autotune_interface void* d_temporary_storage = nullptr; size_t temporary_storage_bytes; - HIP_CHECK(dispatch_binary_search(SubAlgorithm{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + auto dispatch_helper = dispatch_binary_search_helper(); + HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up - HIP_CHECK(dispatch_binary_search(SubAlgorithm{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation @@ -152,16 +184,16 @@ struct device_binary_search_benchmark : public config_autotune_interface // Record start event HIP_CHECK(hipEventRecord(start, 
stream)); - HIP_CHECK(dispatch_binary_search(SubAlgorithm{}, - d_temporary_storage, - temporary_storage_bytes, - d_haystack, - d_needles, - d_output, - haystack_size, - needles_size, - compare_op, - stream)); + HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{}, + d_temporary_storage, + temporary_storage_bytes, + d_haystack, + d_needles, + d_output, + haystack_size, + needles_size, + compare_op, + stream)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); diff --git a/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp b/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp index e744eb060..57ca12ae3 100644 --- a/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp +++ b/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp @@ -59,8 +59,7 @@ std::string config_name() { const rocprim::detail::merge_sort_block_sort_config_params config = Config(); return "{bs:" + std::to_string(config.block_sort_config.block_size) - + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread) - + ",method:" + std::string(get_block_sort_method_name(config.block_sort_method)) + "}"; + + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread) + "}"; } template<> diff --git a/benchmark/benchmark_device_scan.parallel.hpp b/benchmark/benchmark_device_scan.parallel.hpp index 9eb3b59a1..4f976d4c4 100644 --- a/benchmark/benchmark_device_scan.parallel.hpp +++ b/benchmark/benchmark_device_scan.parallel.hpp @@ -222,12 +222,12 @@ struct device_scan_benchmark_generator { void operator()(std::vector>& storage) { - storage.emplace_back(std::make_unique, - rocprim::scan_config_v2< - block_size, + storage.emplace_back( + std::make_unique, + rocprim::scan_config, rocprim::equal_to, 1024, - rocprim::scan_by_key_config_v2< + rocprim::scan_by_key_config< block_size, ItemsPerThread, rocprim::block_load_method::block_load_transpose, diff --git a/benchmark/benchmark_utils.hpp 
b/benchmark/benchmark_utils.hpp index 93931a2f3..2d0c578f4 100644 --- a/benchmark/benchmark_utils.hpp +++ b/benchmark/benchmark_utils.hpp @@ -261,9 +261,11 @@ inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = return data; } -inline bool is_warp_size_supported(const unsigned int required_warp_size) +inline bool is_warp_size_supported(const unsigned int required_warp_size, const int device_id) { - return ::rocprim::host_warp_size() >= required_warp_size; + unsigned int warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, warp_size)); + return warp_size >= required_warp_size; } template diff --git a/benchmark/benchmark_warp_exchange.cpp b/benchmark/benchmark_warp_exchange.cpp index 3fd58f5c3..2abbe389d 100644 --- a/benchmark/benchmark_warp_exchange.cpp +++ b/benchmark/benchmark_warp_exchange.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -350,7 +350,9 @@ int main(int argc, char *argv[]) CREATE_BENCHMARK(int, 256, 16, 32, ScatterToStripedOp) }; - if(is_warp_size_supported(64)) + int hip_device = 0; + HIP_CHECK(::rocprim::detail::get_device_from_stream(stream, hip_device)); + if(is_warp_size_supported(64, hip_device)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 1, 64, BlockedToStripedOp), diff --git a/docs/device_ops/sort.rst b/docs/device_ops/sort.rst index dcaf2778d..a0674e59c 100644 --- a/docs/device_ops/sort.rst +++ b/docs/device_ops/sort.rst @@ -7,7 +7,7 @@ Configuring the kernel merge_sort .......... -.. doxygentypedef:: rocprim::merge_sort_config +.. doxygenstruct:: rocprim::merge_sort_config radix_sort .......... 
diff --git a/docs/device_ops/transform.rst b/docs/device_ops/transform.rst index a401140bf..b38eb45bc 100644 --- a/docs/device_ops/transform.rst +++ b/docs/device_ops/transform.rst @@ -4,7 +4,7 @@ Transform Configuring the kernel ~~~~~~~~~~~~~~~~~~~~~~ -.. doxygentypedef:: rocprim::transform_config +.. doxygenstruct:: rocprim::transform_config transform ~~~~~~~~~ diff --git a/docs/intrinsics.rst b/docs/intrinsics.rst index dc5ae191c..d4b379427 100644 --- a/docs/intrinsics.rst +++ b/docs/intrinsics.rst @@ -12,7 +12,8 @@ Warp size --------- .. doxygenfunction:: rocprim::warp_size() -.. doxygenfunction:: rocprim::host_warp_size() +.. doxygenfunction:: rocprim::host_warp_size(const int device_id, unsigned int& warp_size) +.. doxygenfunction:: rocprim::host_warp_size(const hipStream_t stream, unsigned int& warp_size) .. doxygenfunction:: rocprim::device_warp_size() Lane and Warp ID diff --git a/rmake.py b/rmake.py index e3d0816cb..265577f1d 100644 --- a/rmake.py +++ b/rmake.py @@ -101,6 +101,12 @@ def config_cmd(): #set CPACK_PACKAGING_INSTALL_PREFIX= defined as blank as it is appended to end of path for archive creation cmake_platform_opts.append( f"-DWIN32=ON -DCPACK_PACKAGING_INSTALL_PREFIX=") #" -DCPACK_PACKAGING_INSTALL_PREFIX={rocm_path}" cmake_platform_opts.append( f"-DCMAKE_INSTALL_PREFIX=\"C:/hipSDK\"" ) + + # MSVC requires acknowledgement of using extended aligned storage. + # Before VS 2017 15.8, has non-conforming alignment. VS 2017 15.8 fixes this, but inherently changes layouts of + # aligned storage with extended alignment, and thus binary compatibility with such types. 
+ cmake_platform_opts.append( "-DCMAKE_CXX_FLAGS=\"-D_ENABLE_EXTENDED_ALIGNED_STORAGE\"") + rocm_cmake_path = '"' + cmake_path(os.getenv("ROCM_CMAKE_PATH", "C:/hipSDK")) + '"' generator = f"-G Ninja" # "-G \"Visual Studio 16 2019\" -A x64" # -G NMake ") # diff --git a/rocprim/include/rocprim/block/block_radix_sort.hpp b/rocprim/include/rocprim/block/block_radix_sort.hpp index f94deb0e0..71c1f37f0 100644 --- a/rocprim/include/rocprim/block/block_radix_sort.hpp +++ b/rocprim/include/rocprim/block/block_radix_sort.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -884,6 +884,18 @@ class block_radix_sort } }; +#ifndef DOXYGEN_SHOULD_SKIP_THIS +template +constexpr unsigned int + block_radix_sort:: + radix_bits_per_pass; +#endif + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/block/block_run_length_decode.hpp b/rocprim/include/rocprim/block/block_run_length_decode.hpp new file mode 100644 index 000000000..98b3c2515 --- /dev/null +++ b/rocprim/include/rocprim/block/block_run_length_decode.hpp @@ -0,0 +1,374 @@ +/****************************************************************************** + * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_ +#define ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_ + +#include "../block/block_scan.hpp" +#include "../config.hpp" +#include "../detail/temp_storage.hpp" +#include "../detail/various.hpp" +#include "../functional.hpp" +#include "../intrinsics/thread.hpp" +#include "../thread/thread_search.hpp" + +BEGIN_ROCPRIM_NAMESPACE + +/** + * \brief The block_run_length_decode class supports decoding a run-length encoded array of items. That is, given + * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output + * array. 
+ * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded + * array is runtime-dependent and potentially without any upper bound. To address this, block_run_length_decode allows + * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * + * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. + * + * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). + * A run of length zero may not be followed by a run length that is not zero. + * + * \par + * \code + * __global__ void ExampleKernel(...) + * { + * // Specialising block_run_length_decode to run-length decode items of type uint64_t + * using RunItemT = uint64_t; + * // Type large enough to index into the run-length decoded array + * using RunLengthT = uint32_t; + * + * // Specialising block_run_length_decode for a 1D block of 128 threads + * constexpr int BLOCK_DIM_X = 128; + * // Specialising block_run_length_decode to have each thread contribute 2 run-length encoded runs + * constexpr int RUNS_PER_THREAD = 2; + * // Specialising block_run_length_decode to have each thread hold 4 run-length decoded items + * constexpr int DECODED_ITEMS_PER_THREAD = 4; + * + * // Specialize block_run_length_decode for a 1D block of 128 threads, 2 runs and 4 decoded items per thread + * using block_run_length_decodeT = + * rocprim::block_run_length_decode<RunItemT, BLOCK_DIM_X, RUNS_PER_THREAD, DECODED_ITEMS_PER_THREAD>; + * + * // Allocate shared memory for block_run_length_decode + * __shared__ typename block_run_length_decodeT::storage_type temp_storage; + * + * // The run-length encoded items and how often they shall be repeated in the run-length decoded output + * RunItemT run_values[RUNS_PER_THREAD]; + * RunLengthT run_lengths[RUNS_PER_THREAD]; + * ...
+ * + * // Initialize the block_run_length_decode with the runs that we want to run-length decode + * uint32_t total_decoded_size = 0; + * block_run_length_decodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size); + * + * // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs + * // have been decoded. + * uint32_t decoded_window_offset = 0U; + * while (decoded_window_offset < total_decoded_size) + * { + * RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; + * RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; + * + * // The number of decoded items that are valid within this window (aka pass) of run-length decoding + * uint32_t num_valid_items = total_decoded_size - decoded_window_offset; + * block_rld.run_length_decode(decoded_items, relative_offsets, decoded_window_offset); + * + * decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD; + * + * ... + * } + * } + * \endcode + * \par + * Suppose the set of input \p run_values across the block of threads is + * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and + * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. + * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], + * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2, + * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop. 
+ * + * \tparam ItemT The data type of the items being run-length decoded + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes + * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds + * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the + * runs' lengths) + * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension + * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension + */ +template +class block_run_length_decode +{ +private: + /// The thread block size in threads + static constexpr int BLOCK_THREADS = BlockSizeX * BlockSizeY * BlockSizeZ; + + /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0') + static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD; + + /// block_scan used to determine the beginning of each run (i.e., prefix sum over the runs' length) + using block_scan_type = rocprim::block_scan; + + /// Type used to index into the block's runs + using RunOffsetT = uint32_t; + + /// Shared memory type required by this thread block + union storage_type_ + { + typename block_scan_type::storage_type offset_scan; + struct + { + ItemT run_values[BLOCK_RUNS]; + DecodedOffsetT run_offsets[BLOCK_RUNS]; + } runs; + }; + + ROCPRIM_DEVICE ROCPRIM_INLINE storage_type_& private_storage() + { + ROCPRIM_SHARED_MEMORY storage_type private_storage; + return private_storage.get(); + } + + storage_type_& temp_storage; + + uint32_t linear_tid; + +public: + /// \brief Struct used to allocate a temporary memory that is required for thread + /// communication during operations provided by related parallel primitive. 
+ /// + /// Depending on the implementation, the operations exposed by the parallel primitive may + /// require a temporary storage for thread communication. The storage should be allocated + /// using keywords __shared__. It can be aliased to + /// an externally allocated memory, or be a part of a union type with other storage types + /// to increase shared memory reusability. + using storage_type = detail::raw_storage; + + /** + * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The + * algorithm's temporary storage may not be repurposed between the constructor call and subsequent + * run_length_decode calls. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(storage_type& temp_storage, + ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + : temp_storage(temp_storage.get()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_lengths(run_values, run_lengths, total_decoded_size); + } + + /** + * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The + * algorithm's temporary storage may not be repurposed between the constructor call and subsequent + * run_length_decode calls. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(storage_type& temp_storage, + ItemT (&run_values)[RUNS_PER_THREAD], + UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + : temp_storage(temp_storage.get()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_offsets(run_values, run_offsets); + } + + /** + * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths.
+ */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + : temp_storage(private_storage()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_lengths(run_values, run_lengths, total_decoded_size); + } + + /** + * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD], + UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + : temp_storage(private_storage()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_offsets(run_values, run_offsets); + } + +private: + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + init_with_run_offsets(ItemT (&run_values)[RUNS_PER_THREAD], + RunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + { + // Keep the runs' items and the offsets of each run's beginning in the temporary storage + RunOffsetT thread_dst_offset + = static_cast(linear_tid) * static_cast(RUNS_PER_THREAD); + +#pragma unroll + for(int i = 0; i < RUNS_PER_THREAD; ++i, ++thread_dst_offset) + { + temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; + temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i]; + } + + // Ensure run offsets and run values have been written to shared memory + syncthreads(); + } + + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + init_with_run_lengths(ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + { + // Compute the offset for the beginning of each run + DecodedOffsetT run_offsets[RUNS_PER_THREAD]; +#pragma unroll + for(int i = 0; i < RUNS_PER_THREAD; ++i) + { + run_offsets[i] = static_cast(run_lengths[i]); + } + + DecodedOffsetT decoded_size_aggregate{}; + block_scan_type().exclusive_scan(run_offsets, + run_offsets, + 0, +
decoded_size_aggregate, + temp_storage.offset_scan, + rocprim::plus{}); + total_decoded_size = static_cast(decoded_size_aggregate); + + // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) + syncthreads(); + + init_with_run_offsets(run_values, run_offsets); + } + +public: + /** + * \brief Run-length decodes the runs previously passed to the constructor and returns the run-length decoded + * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the + * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within + * the buffer are returned. Subsequent calls to run_length_decode adjusting \p from_decoded_offset can be + * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to + * run_length_decode is not required. + * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the + * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length + * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`. + * + * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement + * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to + * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results + * in undefined behavior.
+ */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], + RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD], + DecodedOffsetT from_decoded_offset = 0) + { + // The (global) offset of the first item decoded by this thread + DecodedOffsetT thread_decoded_offset + = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD; + + // The run that the first decoded item of this thread belongs to + // If this thread's offset is already beyond the total decoded size, it will be assigned to the + // last run + RunOffsetT current_run + = rocprim::static_upper_bound(temp_storage.runs.run_offsets, + BLOCK_RUNS, + thread_decoded_offset) + - static_cast(1U); + + // Set the current_run_end to thread_decoded_offset to trigger new run branch in the first iteration + DecodedOffsetT current_run_begin, current_run_end = thread_decoded_offset; + + ItemT val{}; + +#pragma unroll + for(DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; ++i, ++thread_decoded_offset) + { + // If we are in a new run... + if(thread_decoded_offset == current_run_end) + { + // The value of the new run + val = temp_storage.runs.run_values[current_run]; + + // The run bounds + current_run_begin = temp_storage.runs.run_offsets[current_run]; + current_run_end = temp_storage.runs.run_offsets[++current_run]; + } + + // Decode the current run by storing the run's value + decoded_items[i] = val; + item_offsets[i] = thread_decoded_offset - current_run_begin; + } + } + + /** + * \brief Run-length decodes the runs previously passed to the constructor and returns the run-length decoded + * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the + * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within + * the buffer are returned.
Subsequent calls to run_length_decode adjusting \p from_decoded_offset can be + * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to + * run_length_decode is not required. + * + * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement + * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results + * in undefined behavior. + */ + ROCPRIM_DEVICE ROCPRIM_INLINE void + run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], + DecodedOffsetT from_decoded_offset = 0) + { + DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD]; + run_length_decode(decoded_items, item_offsets, from_decoded_offset); + } +}; + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/block/block_shuffle.hpp b/rocprim/include/rocprim/block/block_shuffle.hpp index 3a6e5abc3..50865e19e 100644 --- a/rocprim/include/rocprim/block/block_shuffle.hpp +++ b/rocprim/include/rocprim/block/block_shuffle.hpp @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -35,11 +35,11 @@ #include "../config.hpp" #include "../detail/various.hpp" -#include "../intrinsics.hpp" #include "../functional.hpp" +#include "../intrinsics.hpp" -#include "detail/block_reduce_warp_reduce.hpp" #include "detail/block_reduce_raking_reduce.hpp" +#include "detail/block_reduce_warp_reduce.hpp" /// \addtogroup blockmodule /// @{ @@ -87,11 +87,7 @@ BEGIN_ROCPRIM_NAMESPACE /// } /// \endcode /// \endparblock -template< - class T, - unsigned int BlockSizeX, - unsigned int BlockSizeY = 1, - unsigned int BlockSizeZ = 1> +template class block_shuffle { static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ; @@ -99,25 +95,23 @@ class block_shuffle // Struct used for creating a raw_storage object for this primitive's temporary storage. struct storage_type_ { - T prev[BlockSize]; - T next[BlockSize]; + T buffer[BlockSize]; }; public: - - /// \brief Struct used to allocate a temporary memory that is required for thread - /// communication during operations provided by related parallel primitive. - /// - /// Depending on the implemention the operations exposed by parallel primitive may - /// require a temporary storage for thread communication. The storage should be allocated - /// using keywords __shared__. It can be aliased to - /// an externally allocated memory, or be a part of a union type with other storage types - /// to increase shared memory reusability. - #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen - using storage_type = detail::raw_storage; - #else - using storage_type = storage_type_; // only for Doxygen - #endif +/// \brief Struct used to allocate a temporary memory that is required for thread +/// communication during operations provided by related parallel primitive. 
+/// +/// Depending on the implemention the operations exposed by parallel primitive may +/// require a temporary storage for thread communication. The storage should be allocated +/// using keywords __shared__. It can be aliased to +/// an externally allocated memory, or be a part of a union type with other storage types +/// to increase shared memory reusability. +#ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen + using storage_type = detail::raw_storage; +#else + using storage_type = storage_type_; // only for Doxygen +#endif /// \brief Shuffles data across threads in a block, offseted by the distance value. /// @@ -144,15 +138,12 @@ class block_shuffle /// ... /// } /// \endcode - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void offset(T input, - T& output, - int distance = 1) + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(T input, T& output, int distance = 1) { - offset( - ::rocprim::flat_block_thread_id(), - input, output, distance - ); + offset(::rocprim::flat_block_thread_id(), + input, + output, + distance); } /// \brief Shuffles data across threads in a block, offseted by the distance value. @@ -164,11 +155,8 @@ class block_shuffle /// \param [in] input - input data to be shuffled to another thread. /// \param [out] output - reference to a output value, that receives data from another thread /// \param [in] distance - The input threadId + distance = output threadId. - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void offset(const size_t& flat_id, - T input, - T& output, - int distance) + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + offset(const size_t& flat_id, T input, T& output, int distance) { ROCPRIM_SHARED_MEMORY storage_type storage; offset(flat_id, input, output, distance, storage); @@ -184,22 +172,19 @@ class block_shuffle /// \param [out] output - reference to a output value, that receives data from another thread /// \param [in] distance - The input threadId + distance = output threadId. 
/// \param [in] storage - reference to a temporary storage object of type storage_type. - ROCPRIM_DEVICE ROCPRIM_INLINE - void offset(const size_t& flat_id, - T input, - T& output, - int distance, - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE void + offset(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) { - storage_type_& storage_ = storage.get(); - storage_.prev[flat_id] = input; + storage_type_& storage_ = storage.get(); + storage_.buffer[flat_id] = input; + + const int offset_tid = static_cast(flat_id) + distance; ::rocprim::syncthreads(); - const int offset_tid = static_cast(flat_id) + distance; - if ((offset_tid >= 0) && (offset_tid < (int)BlockSize)) + if((offset_tid >= 0) && (offset_tid < (int)BlockSize)) { - output = storage_.prev[static_cast(offset_tid)]; + output = storage_.buffer[static_cast(offset_tid)]; } } @@ -210,7 +195,7 @@ class block_shuffle /// /// \param [in] input - input data to be shuffled to another thread. /// \param [out] output - reference to a output value, that receives data from another thread - /// \param [in] distance - The input threadId + distance = output threadId. + /// \param [in] distance - The input threadId + distance = output threadId. Distance magnitude should be <= BlockSize. /// /// \par Example. /// \code{.cpp} @@ -228,15 +213,12 @@ class block_shuffle /// ... /// } /// \endcode - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void rotate(T input, - T& output, - unsigned int distance = 1) + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(T input, T& output, int distance = 1) { - rotate( - ::rocprim::flat_block_thread_id(), - input, output, distance - ); + rotate(::rocprim::flat_block_thread_id(), + input, + output, + distance); } /// \brief Shuffles data across threads in a block, offseted by the distance value. @@ -248,11 +230,8 @@ class block_shuffle /// \param [in] input - input data to be shuffled to another thread. 
/// \param [out] output - reference to a output value, that receives data from another thread /// \param [in] distance - The input threadId + distance = output threadId. - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void rotate(const size_t& flat_id, - T input, - T& output, - unsigned int distance) + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + rotate(const size_t& flat_id, T input, T& output, int distance) { ROCPRIM_SHARED_MEMORY storage_type storage; rotate(flat_id, input, output, distance, storage); @@ -268,25 +247,26 @@ class block_shuffle /// \param [out] output - reference to a output value, that receives data from another thread /// \param [in] distance - The input threadId + distance = output threadId. /// \param [in] storage - reference to a temporary storage object of type storage_type. - ROCPRIM_DEVICE ROCPRIM_INLINE - void rotate(const size_t& flat_id, - T input, - T& output, - unsigned int distance, - storage_type& storage) + ROCPRIM_DEVICE ROCPRIM_INLINE void + rotate(const size_t& flat_id, T input, T& output, int distance, storage_type& storage) { - storage_type_& storage_ = storage.get(); - storage_.prev[flat_id] = input; + storage_type_& storage_ = storage.get(); + storage_.buffer[flat_id] = input; - ::rocprim::syncthreads(); - - unsigned int offset = threadIdx.x + distance; - if (offset >= BlockSize) + int offset = static_cast(flat_id) + distance; + if(offset >= (int)BlockSize) + { offset -= BlockSize; + } + else if(offset < 0) + { + offset += BlockSize; + } - output = storage_.prev[offset]; - } + ::rocprim::syncthreads(); + output = storage_.buffer[offset]; + } /// \brief The thread block rotates a blocked arrange of input items, /// shifting it up by one item @@ -311,15 +291,13 @@ class block_shuffle /// ... 
/// } /// \endcode - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void up(T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread]) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread]) { - this->up( - ::rocprim::flat_block_thread_id(), - input, prev - ); + this->up(::rocprim::flat_block_thread_id(), + input, + prev); } /// \brief The thread block rotates a blocked arrange of input items, @@ -329,11 +307,9 @@ class block_shuffle /// \param [in] input - The calling thread's input items /// \param [out] prev - The corresponding predecessor items (may be aliased to \p input). /// The item \p prev[0] is not updated for thread0. - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread]) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + up(const size_t& flat_id, T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; this->up(flat_id, input, prev, storage); @@ -347,31 +323,29 @@ class block_shuffle /// \param [out] prev - The corresponding predecessor items (may be aliased to \p input). /// \param [in] storage - reference to a temporary storage object of type storage_type. /// The item \p prev[0] is not updated for thread0. 
- template - ROCPRIM_DEVICE ROCPRIM_INLINE - void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread], + storage_type& storage) { - storage_type_& storage_ = storage.get(); - storage_.prev[flat_id] = input[ItemsPerThread -1]; - - ::rocprim::syncthreads(); + storage_type_& storage_ = storage.get(); + storage_.buffer[flat_id] = input[ItemsPerThread - 1]; ROCPRIM_UNROLL - for (unsigned int i = ItemsPerThread - 1; i > 0; --i) + for(unsigned int i = ItemsPerThread - 1; i > 0; --i) { prev[i] = input[i - 1]; } - if (flat_id > 0) + ::rocprim::syncthreads(); + + if(flat_id > 0) { - prev[0] = storage_.prev[flat_id - 1]; + prev[0] = storage_.buffer[flat_id - 1]; } } - /// \brief The thread block rotates a blocked arrange of input items, /// shifting it up by one item /// @@ -380,16 +354,14 @@ class block_shuffle /// The item \p prev[0] is not updated for thread0. /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from /// threadBlockSize-1, provided to all threads - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void up(T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - T &block_suffix) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + up(T (&input)[ItemsPerThread], T (&prev)[ItemsPerThread], T& block_suffix) { - this->up( - ::rocprim::flat_block_thread_id(), - input, prev, block_suffix - ); + this->up(::rocprim::flat_block_thread_id(), + input, + prev, + block_suffix); } /// \brief The thread block rotates a blocked arrange of input items, @@ -401,12 +373,11 @@ class block_shuffle /// The item \p prev[0] is not updated for thread0. 
/// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from /// threadBlockSize-1, provided to all threads - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - T &block_suffix) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread], + T& block_suffix) { ROCPRIM_SHARED_MEMORY storage_type storage; this->up(flat_id, input, prev, block_suffix, storage); @@ -422,18 +393,17 @@ class block_shuffle /// \param [out] block_suffix - The item \p input[ItemsPerThread-1] from /// threadBlockSize-1, provided to all threads /// \param [in] storage - reference to a temporary storage object of type storage_type. - template - ROCPRIM_DEVICE ROCPRIM_INLINE - void up(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&prev)[ItemsPerThread], - T &block_suffix, - storage_type& storage) + template + ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&prev)[ItemsPerThread], + T& block_suffix, + storage_type& storage) { up(flat_id, input, prev, storage); // Update block prefix - block_suffix = storage->prev[BlockSize - 1]; + block_suffix = storage.get().buffer[BlockSize - 1]; // unwrap the raw storage via get(), as the other overloads do } /// \brief The thread block rotates a blocked arrange of input items, @@ -459,15 +429,13 @@ class block_shuffle /// ...
/// } /// \endcode - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void down(T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread]) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread]) { - this->down( - ::rocprim::flat_block_thread_id(), - input, next - ); + this->down(::rocprim::flat_block_thread_id(), + input, + next); } /// \brief The thread block rotates a blocked arrange of input items, @@ -477,11 +445,9 @@ class block_shuffle /// \param [in] input - The calling thread's input items /// \param [out] next - The corresponding successor items (may be aliased to \p input). /// The item \p prev[0] is not updated for threadBlockSize - 1. - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread]) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + down(const size_t& flat_id, T (&input)[ItemsPerThread], T (&next)[ItemsPerThread]) { ROCPRIM_SHARED_MEMORY storage_type storage; this->down(flat_id, input, next, storage); @@ -495,27 +461,26 @@ class block_shuffle /// \param [out] next - The corresponding successor items (may be aliased to \p input). /// The item \p prev[0] is not updated for threadBlockSize - 1. /// \param [in] storage - reference to a temporary storage object of type storage_type. 
- template - ROCPRIM_DEVICE ROCPRIM_INLINE - void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - storage_type& storage) + template + ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread], + storage_type& storage) { - storage_type_& storage_ = storage.get(); - storage_.next[flat_id] = input[0]; - - ::rocprim::syncthreads(); + storage_type_& storage_ = storage.get(); + storage_.buffer[flat_id] = input[0]; ROCPRIM_UNROLL - for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i) + for(unsigned int i = 0; i < (ItemsPerThread - 1); ++i) { - next[i] = input[i + 1]; + next[i] = input[i + 1]; } - if (flat_id <(BlockSize -1)) + ::rocprim::syncthreads(); + + if(flat_id < (BlockSize - 1)) { - next[ItemsPerThread -1] = storage_.next[flat_id + 1]; + next[ItemsPerThread - 1] = storage_.buffer[flat_id + 1]; } } @@ -526,16 +491,14 @@ class block_shuffle /// \param [out] next - The corresponding successor items (may be aliased to \p input). /// The item \p prev[0] is not updated for threadBlockSize - 1. /// \param [out] block_prefix - The item \p input[0] from thread0, provided to all threads - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void down(T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - T &block_prefix) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + down(T (&input)[ItemsPerThread], T (&next)[ItemsPerThread], T& block_prefix) { - this->down( - ::rocprim::flat_block_thread_id(), - input, next, block_prefix - ); + this->down(::rocprim::flat_block_thread_id(), + input, + next, + block_prefix); } /// \brief The thread block rotates a blocked arrange of input items, @@ -546,12 +509,11 @@ class block_shuffle /// \param [out] next - The corresponding successor items (may be aliased to \p input). /// The item \p prev[0] is not updated for threadBlockSize - 1. 
/// \param [out] block_prefix - The item \p input[0] from thread0, provided to all threads - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - T &block_prefix) + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread], + T& block_prefix) { ROCPRIM_SHARED_MEMORY storage_type storage; this->down(flat_id, input, next, block_prefix, storage); @@ -566,13 +528,12 @@ class block_shuffle /// The item \p prev[0] is not updated for threadBlockSize - 1. /// \param [out] block_prefix - The item \p input[0] from thread0, provided to all threads /// \param [in] storage - reference to a temporary storage object of type storage_type. - template - ROCPRIM_DEVICE ROCPRIM_INLINE - void down(const size_t& flat_id, - T (&input)[ItemsPerThread], - T (&next)[ItemsPerThread], - T &block_prefix, - storage_type& storage) + template + ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t& flat_id, + T (&input)[ItemsPerThread], + T (&next)[ItemsPerThread], + T& block_prefix, + storage_type& storage) { this->down(flat_id, input, next, storage); @@ -581,7 +542,6 @@ class block_shuffle } }; - END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp index 21e7613f4..f5cfc031d 100644 --- a/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp +++ b/rocprim/include/rocprim/block/detail/block_histogram_atomic.hpp @@ -69,14 +69,8 @@ class block_histogram_atomic const unsigned int bin = static_cast(input[i]); // Get a mask with the threads that have the same value for `bin`. - ::rocprim::lane_mask_type peer_mask = ballot(1); - ROCPRIM_UNROLL - for(unsigned int b = 1; b < Bins; b <<= 1) - { - const unsigned int bit_set = bin & b; - const auto bit_set_mask = ballot(bit_set); - peer_mask &= (bit_set ? 
bit_set_mask : ~bit_set_mask); - } + ::rocprim::lane_mask_type peer_mask + = ::rocprim::match_any<::rocprim::Log2::VALUE>(bin); // The total number of threads in the warp which also have this digit. const unsigned int bin_count = bit_count(peer_mask); diff --git a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp index a2c2723b7..abcdb3257 100644 --- a/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp +++ b/rocprim/include/rocprim/block/detail/block_radix_rank_match.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -113,15 +113,7 @@ class block_radix_rank_match const digit_counter_type warp_digit_prefix = *digit_counters[i]; // Construct a mask of threads in this wave which have the same digit. - ::rocprim::lane_mask_type peer_mask = ::rocprim::ballot(1); - - ROCPRIM_UNROLL - for(unsigned int b = 0; b < RadixBits; ++b) - { - const unsigned int bit_set = digit & (1u << b); - const ::rocprim::lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set); - peer_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); - } + ::rocprim::lane_mask_type peer_mask = ::rocprim::match_any(digit); ::rocprim::wave_barrier(); @@ -131,8 +123,7 @@ class block_radix_rank_match // than the current thread's. const unsigned int peer_digit_prefix = rocprim::masked_bit_count(peer_mask); - // The first thread with a particular digit gets to update the shared counter. 
- if(peer_digit_prefix == 0) + if(::rocprim::group_elect(peer_mask)) { *digit_counters[i] = warp_digit_prefix + digit_count; } diff --git a/rocprim/include/rocprim/config.hpp b/rocprim/include/rocprim/config.hpp index 5bc5e516f..b78afff25 100644 --- a/rocprim/include/rocprim/config.hpp +++ b/rocprim/include/rocprim/config.hpp @@ -72,8 +72,22 @@ #define ROCPRIM_FORCE_INLINE __attribute__((always_inline)) #endif -#ifndef ROCPRIM_DISABLE_DPP - #define ROCPRIM_DETAIL_USE_DPP true +// DPP is supported only after Volcanic Islands (GFX8+) +// Only defined when support is present, in contrast to ROCPRIM_DETAIL_USE_DPP, which should be +// always defined +#if defined(__HIP_DEVICE_COMPILE__) && defined(__AMDGCN__) \ + && (!defined(__GFX6__) && !defined(__GFX7__)) + #define ROCPRIM_DETAIL_HAS_DPP 1 +#endif + +#if !defined(ROCPRIM_DISABLE_DPP) && defined(ROCPRIM_DETAIL_HAS_DPP) + #define ROCPRIM_DETAIL_USE_DPP 1 +#else + #define ROCPRIM_DETAIL_USE_DPP 0 +#endif + +#if defined(ROCPRIM_DETAIL_HAS_DPP) && (defined(__GFX8__) || defined(__GFX9__)) + #define ROCPRIM_DETAIL_HAS_DPP_BROADCAST 1 #endif #ifndef ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS @@ -95,11 +109,12 @@ #define ROCPRIM_TARGET_ARCH 0 #endif -#if(__gfx1010__ || __gfx1011__ || __gfx1012__ || __gfx1030__ || __gfx1031__ || __gfx1032__ \ - || __gfx1035__ || __gfx1100__ || __gfx1101__ || __gfx1102__) - #define ROCPRIM_NAVI 1 -#else - #define ROCPRIM_NAVI 0 +#ifndef ROCPRIM_NAVI + #if defined(__HIP_DEVICE_COMPILE__) && (defined(__GFX10__) || defined(__GFX11__)) + #define ROCPRIM_NAVI 1 + #else + #define ROCPRIM_NAVI 0 + #endif #endif #define ROCPRIM_ARCH_90a 910 diff --git a/rocprim/include/rocprim/detail/radix_sort.hpp b/rocprim/include/rocprim/detail/radix_sort.hpp index 66ba2e356..32ff17e87 100644 --- a/rocprim/include/rocprim/detail/radix_sort.hpp +++ b/rocprim/include/rocprim/detail/radix_sort.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
+// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -67,6 +67,33 @@ struct radix_key_codec_integral +struct radix_key_codec_integral< + Key, + BitKey, + typename std::enable_if::value>::type> +{ + using bit_key_type = BitKey; + + ROCPRIM_DEVICE ROCPRIM_INLINE static bit_key_type encode(Key key) + { + return __builtin_bit_cast(bit_key_type, key); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE static Key decode(bit_key_type bit_key) + { + return __builtin_bit_cast(Key, bit_key); + } + + template + ROCPRIM_DEVICE static unsigned int + extract_digit(bit_key_type bit_key, unsigned int start, unsigned int length) + { + unsigned int mask = (1u << length) - 1; + return static_cast(bit_key >> start) & mask; + } +}; + template struct radix_key_codec_integral::value>::type> { @@ -97,6 +124,36 @@ struct radix_key_codec_integral +struct radix_key_codec_integral::value>::type> +{ + using bit_key_type = BitKey; + + static constexpr bit_key_type sign_bit = bit_key_type(1) << (sizeof(bit_key_type) * 8 - 1); + + ROCPRIM_DEVICE ROCPRIM_INLINE static bit_key_type encode(Key key) + { + const bit_key_type bit_key = __builtin_bit_cast(bit_key_type, key); + return sign_bit ^ bit_key; + } + + ROCPRIM_DEVICE ROCPRIM_INLINE static Key decode(bit_key_type bit_key) + { + bit_key ^= sign_bit; + return __builtin_bit_cast(Key, bit_key); + } + + template + ROCPRIM_DEVICE static unsigned int + extract_digit(bit_key_type bit_key, unsigned int start, unsigned int length) + { + unsigned int mask = (1u << length) - 1; + return static_cast(bit_key >> start) & mask; + } +}; + template struct float_bit_mask; @@ -199,6 +256,18 @@ struct radix_key_codec_base< typename std::enable_if<::rocprim::is_integral::value>::type > : radix_key_codec_integral::type> { }; +template +struct radix_key_codec_base::value>::type> + : 
radix_key_codec_integral +{}; + +template +struct radix_key_codec_base::value>::type> + : radix_key_codec_integral +{}; + template<> struct radix_key_codec_base { diff --git a/rocprim/include/rocprim/device/config_types.hpp b/rocprim/include/rocprim/device/config_types.hpp index c83934cbc..0b8c75cd8 100644 --- a/rocprim/include/rocprim/device/config_types.hpp +++ b/rocprim/include/rocprim/device/config_types.hpp @@ -29,7 +29,6 @@ #include #include "../config.hpp" -#include "../intrinsics/thread.hpp" #include "../detail/various.hpp" /// \addtogroup primitivesmodule_deviceconfigs @@ -49,7 +48,7 @@ struct default_config // merge_sort_config using block_sort_config = default_config; using block_merge_config = default_config; - // radix_sort_config_v2 + // radix_sort_config using single_sort_config = default_config; using merge_sort_config = default_config; using onesweep_config = default_config; @@ -227,7 +226,7 @@ constexpr target_arch get_target_arch_from_name(const char* const arch_name, con /** * \brief Get the current architecture in device compilation. * - * This function will always return `unkown` when called from the host, host could should instead + * This function will always return `unknown` when called from the host, host could should instead * call host_target_arch to query the current device from the HIP API. * * \return target_arch the architecture currently being compiled for on the device. 
@@ -318,7 +317,6 @@ inline hipError_t get_device_arch(int device_id, target_arch& arch) return hipSuccess; } -#ifndef _WIN32 inline hipError_t get_device_from_stream(const hipStream_t stream, int& device_id) { static constexpr hipStream_t default_stream = 0; @@ -343,15 +341,9 @@ inline hipError_t get_device_from_stream(const hipStream_t stream, int& device_i #endif return hipSuccess; } -#endif inline hipError_t host_target_arch(const hipStream_t stream, target_arch& arch) { -#ifdef _WIN32 - (void)stream; - arch = target_arch::unknown; - return hipSuccess; -#else int device_id; const hipError_t result = get_device_from_stream(stream, device_id); if(result != hipSuccess) @@ -360,11 +352,48 @@ inline hipError_t host_target_arch(const hipStream_t stream, target_arch& arch) } return get_device_arch(device_id, arch); -#endif } } // end namespace detail +/// \brief Returns a number of threads in a hardware warp for the actual device. +/// At host side this constant is available at runtime only. +/// \param device_id - the device that should be queried. +/// \param warp_size - out parameter for the warp size. +/// \return hipError_t any error that might occur. +/// +/// It is constant for a device. +ROCPRIM_HOST inline hipError_t host_warp_size(const int device_id, unsigned int& warp_size) +{ + warp_size = -1; + hipDeviceProp_t device_prop; + hipError_t success = hipGetDeviceProperties(&device_prop, device_id); + + if(success == hipSuccess) + { + warp_size = device_prop.warpSize; + } + return success; +}; + +/// \brief Returns the number of threads in a hardware warp for the device associated with the stream. +/// At host side this constant is available at runtime only. +/// \param stream - the stream, whose device should be queried. +/// \param warp_size - out parameter for the warp size. +/// \return hipError_t any error that might occur. +/// +/// It is constant for a device. 
+ROCPRIM_HOST inline hipError_t host_warp_size(const hipStream_t stream, unsigned int& warp_size) +{ + int hip_device; + hipError_t success = detail::get_device_from_stream(stream, hip_device); + if(success == hipSuccess) + { + return host_warp_size(hip_device, warp_size); + } + return success; +}; + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp new file mode 100644 index 000000000..8f5cdabdb --- /dev/null +++ b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference.hpp @@ -0,0 +1,527 @@ +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_ +#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_ + +#include "../../../type_traits.hpp" +#include "../device_config_helper.hpp" +#include + +/* DO NOT EDIT THIS FILE + * This file is automatically generated by `/scripts/autotune/create_optimization.py`. + * so most likely you want to edit rocprim/device/device_(algo)_config.hpp + */ + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +struct default_adjacent_difference_config : default_adjacent_difference_config_base +{}; +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<128, 4> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<256, 4> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<32, 8> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 8> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<32, 4> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<1024, 2> +{}; + +// Based on value_type = int +template 
+struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<32, 4> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<256, 32> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<128, 64> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 
4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<128, 64> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 8> +{}; + +// Based on value_type = int64_t +template +struct 
default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<128, 8> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : 
adjacent_difference_config<64, 8> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<128, 8> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + 
std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<64, 8> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<128, 2> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<128, 8> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +} // end namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_ \ No newline at end of file diff --git a/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp new file mode 100644 index 000000000..0718b6e65 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/config/device_adjacent_difference_inplace.hpp @@ -0,0 +1,529 @@ +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_ +#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_ + +#include "../../../type_traits.hpp" +#include "../device_config_helper.hpp" +#include + +/* DO NOT EDIT THIS FILE + * This file is automatically generated by `/scripts/autotune/create_optimization.py`. 
+ * so most likely you want to edit rocprim/device/device_(algo)_config.hpp + */ + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +struct default_adjacent_difference_inplace_config + : default_adjacent_difference_config_base +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 16> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 16> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<128, 16> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, 
+ std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<256, 32> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1102), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<512, 32> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<1024, 8> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 4> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> 
+{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<1024, 8> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<32, 64> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<128, 64> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<256, 64> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<128, 64> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<256, 64> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx900), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<512, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 8> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<1024, 4> +{}; 
+ +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<512, 16> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<256, 16> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<64, 32> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8> +{}; + +// 
Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<64, 32> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +// Based on value_type = double +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = float +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = rocprim::half +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + 
value_type, + std::enable_if_t<(bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2))>> : adjacent_difference_config<512, 8> +{}; + +// Based on value_type = int64_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : adjacent_difference_config<512, 2> +{}; + +// Based on value_type = int +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : adjacent_difference_config<1024, 4> +{}; + +// Based on value_type = short +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : adjacent_difference_config<64, 32> +{}; + +// Based on value_type = int8_t +template +struct default_adjacent_difference_inplace_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<(!bool(rocprim::is_floating_point::value) + && (sizeof(value_type) <= 1))>> : adjacent_difference_config<64, 16> +{}; + +} // end namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_ \ No newline at end of file diff --git a/rocprim/include/rocprim/device/detail/config/device_scan.hpp b/rocprim/include/rocprim/device/detail/config/device_scan.hpp index 867753a07..7fe8e7259 100644 --- a/rocprim/include/rocprim/device/detail/config/device_scan.hpp +++ b/rocprim/include/rocprim/device/detail/config/device_scan.hpp @@ -49,11 +49,11 @@ struct default_scan_config< value_type, 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = float @@ -63,11 +63,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = rocprim::half @@ -76,11 +76,11 @@ struct default_scan_config(target_arch::gfx908), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = int64_t @@ -90,11 +90,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - 
block_scan_algorithm::using_warp_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -104,11 +104,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = short @@ -118,11 +118,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int8_t @@ -131,11 +131,11 @@ struct default_scan_config(target_arch::gfx908), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = double @@ -145,11 +145,11 @@ struct 
default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = float @@ -159,11 +159,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = rocprim::half @@ -172,11 +172,11 @@ struct default_scan_config(target_arch::gfx900), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int64_t @@ -186,11 +186,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 10, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -200,11 +200,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = short @@ -214,11 +214,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int8_t @@ -227,11 +227,11 @@ struct default_scan_config(target_arch::gfx900), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // 
Based on value_type = double @@ -241,11 +241,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = float @@ -255,11 +255,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = rocprim::half @@ -268,11 +268,11 @@ struct default_scan_config(target_arch::gfx906), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = int64_t @@ -282,11 +282,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<64, - 14, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -296,11 +296,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = short @@ -310,11 +310,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = int8_t @@ -323,11 +323,11 @@ struct default_scan_config(target_arch::gfx906), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + 
::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = double @@ -337,11 +337,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = float @@ -351,11 +351,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = rocprim::half @@ -364,11 +364,11 @@ struct default_scan_config(target_arch::gfx1030), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int64_t @@ -378,11 +378,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) 
<= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 9, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 9, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -392,11 +392,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = short @@ -406,11 +406,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<256, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int8_t @@ -419,11 +419,11 @@ struct default_scan_config(target_arch::gfx1030), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + 
::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = double @@ -433,11 +433,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = float @@ -447,11 +447,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = rocprim::half @@ -460,11 +460,11 @@ struct default_scan_config(target_arch::unknown), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = int64_t @@ -474,11 +474,11 @@ struct default_scan_config< value_type, 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -488,11 +488,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = short @@ -502,11 +502,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int8_t @@ -515,11 +515,11 @@ struct default_scan_config(target_arch::unknown), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - 
block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = double @@ -529,11 +529,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = float @@ -543,11 +543,11 @@ struct default_scan_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = rocprim::half @@ -556,11 +556,11 @@ struct default_scan_config(target_arch::gfx90a), value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2))>> - : scan_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on value_type = int64_t @@ -570,11 
+570,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_config_v2<256, - 6, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 6, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int @@ -584,11 +584,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = short @@ -598,11 +598,11 @@ struct default_scan_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on value_type = int8_t @@ -611,11 +611,11 @@ struct default_scan_config(target_arch::gfx90a), value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(value_type) <= 1))>> - : scan_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp b/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp index 90ae38b42..f6f375f24 100644 --- a/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp +++ b/rocprim/include/rocprim/device/detail/config/device_scan_by_key.hpp @@ -51,11 +51,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int @@ -67,11 +67,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = short @@ -83,11 +83,11 @@ struct default_scan_by_key_config< 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int8_t @@ -98,11 +98,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int64_t @@ -114,11 +114,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int @@ -130,11 +130,11 @@ struct default_scan_by_key_config< 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = short @@ -146,11 +146,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int8_t @@ -161,11 +161,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -176,11 +176,11 @@ struct default_scan_by_key_config< value_type, 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -191,11 +191,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -206,11 +206,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -221,11 +221,11 @@ struct default_scan_by_key_config< value_type, 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -237,11 +237,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = int @@ -253,11 +253,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = short @@ -269,11 +269,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) 
<= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -284,11 +284,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int64_t @@ -300,11 +300,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int @@ -316,11 +316,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && 
(sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -332,11 +332,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int8_t @@ -347,11 +347,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int64_t @@ -363,11 +363,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && 
(sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int @@ -379,11 +379,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -395,11 +395,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int8_t @@ -410,11 +410,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : 
scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -425,11 +425,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int @@ -440,11 +440,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -455,11 +455,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -470,11 +470,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = int64_t @@ -486,11 +486,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = int @@ -502,11 +502,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 19, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 19, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = short @@ -518,11 +518,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = int8_t @@ -533,11 +533,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int64_t @@ -549,11 +549,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 10, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int @@ -565,11 +565,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = short @@ -581,11 +581,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int8_t @@ -596,11 +596,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -611,11 +611,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -626,11 +626,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -641,11 +641,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - 
block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -656,11 +656,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -672,11 +672,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int @@ -688,11 +688,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 19, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : 
scan_by_key_config<128, + 19, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = short @@ -704,11 +704,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -719,11 +719,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int64_t @@ -735,11 +735,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 10, + 
::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int @@ -751,11 +751,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -767,11 +767,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int8_t @@ -782,11 +782,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + 
::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int64_t @@ -798,11 +798,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int @@ -814,11 +814,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -830,11 +830,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 22, + ::rocprim::block_load_method::block_load_transpose, + 
::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int8_t @@ -845,11 +845,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -860,11 +860,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int @@ -875,11 +875,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 17, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 17, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + 
block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -890,11 +890,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -905,11 +905,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int64_t @@ -921,11 +921,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = 
int @@ -937,11 +937,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = short @@ -953,11 +953,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 23, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 23, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int8_t @@ -968,11 +968,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int64_t @@ -984,11 +984,11 @@ struct 
default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int @@ -1000,11 +1000,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = short @@ -1016,11 +1016,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int8_t @@ -1031,11 +1031,11 @@ struct default_scan_by_key_config< 
value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -1046,11 +1046,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -1061,11 +1061,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -1076,11 +1076,11 @@ struct default_scan_by_key_config< value_type, 
std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -1091,11 +1091,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -1107,11 +1107,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int @@ -1123,11 +1123,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) 
&& (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = short @@ -1139,11 +1139,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 23, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 23, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -1154,11 +1154,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int64_t @@ -1170,11 +1170,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && 
(sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int @@ -1186,11 +1186,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -1202,11 +1202,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int8_t @@ -1217,11 +1217,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) 
<= 1))>> - : scan_by_key_config_v2<64, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int64_t @@ -1233,11 +1233,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int @@ -1249,11 +1249,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -1265,11 +1265,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : 
scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int8_t @@ -1280,11 +1280,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -1295,11 +1295,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int @@ -1310,11 +1310,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 13, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 13, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -1325,11 +1325,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -1340,11 +1340,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = int64_t @@ -1356,11 +1356,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 23, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 23, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int @@ -1372,11 +1372,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 23, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 23, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = short @@ -1388,11 +1388,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int8_t @@ -1403,11 +1403,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 17, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 17, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int64_t @@ -1419,11 +1419,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 9, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 9, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int @@ -1435,11 +1435,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 15, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 15, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = short @@ -1451,11 +1451,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int8_t @@ -1466,11 +1466,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 7, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 7, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -1481,11 +1481,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 23, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 23, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -1496,11 +1496,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - 
block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -1511,11 +1511,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<128, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -1526,11 +1526,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -1542,11 +1542,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 13, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : 
scan_by_key_config<256, + 13, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int @@ -1558,11 +1558,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 19, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 19, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = short @@ -1574,11 +1574,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -1589,11 +1589,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + 
::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int64_t @@ -1605,11 +1605,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 9, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 9, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int @@ -1621,11 +1621,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -1637,11 +1637,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 22, + 
::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int8_t @@ -1652,11 +1652,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 7, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 7, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int64_t @@ -1668,11 +1668,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 9, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 9, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int @@ -1684,11 +1684,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 22, + 
::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -1700,11 +1700,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<128, - 22, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 22, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int8_t @@ -1715,11 +1715,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -1730,11 +1730,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 17, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<128, + 17, + ::rocprim::block_load_method::block_load_transpose, + 
::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int @@ -1745,11 +1745,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -1760,11 +1760,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<128, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -1775,11 +1775,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> 
{}; // Based on key_type = double, value_type = int64_t @@ -1791,11 +1791,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int @@ -1807,11 +1807,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = short @@ -1823,11 +1823,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = 
double, value_type = int8_t @@ -1838,11 +1838,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int64_t @@ -1854,11 +1854,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int @@ -1870,11 +1870,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = short @@ -1886,11 
+1886,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int8_t @@ -1901,11 +1901,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -1916,11 +1916,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -1931,11 +1931,11 @@ struct 
default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -1946,11 +1946,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -1961,11 +1961,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -1977,11 +1977,11 @@ struct default_scan_by_key_config< 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = int @@ -1993,11 +1993,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = short @@ -2009,11 +2009,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -2024,11 +2024,11 @@ struct default_scan_by_key_config< value_type, 
std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int64_t @@ -2040,11 +2040,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int @@ -2056,11 +2056,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -2072,11 +2072,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) 
&& (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int8_t @@ -2087,11 +2087,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int64_t @@ -2103,11 +2103,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int @@ -2119,11 +2119,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && 
(sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -2135,11 +2135,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int8_t @@ -2150,11 +2150,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -2165,11 +2165,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) 
&& (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int @@ -2180,11 +2180,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -2195,11 +2195,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -2210,11 +2210,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = int64_t @@ -2226,11 +2226,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int @@ -2242,11 +2242,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = double, value_type = short @@ -2258,11 +2258,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = double, value_type = int8_t @@ -2273,11 +2273,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int64_t @@ -2289,11 +2289,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = float, value_type = int @@ -2305,11 +2305,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = short @@ -2321,11 +2321,11 @@ struct default_scan_by_key_config< std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = float, value_type = int8_t @@ -2336,11 +2336,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int64_t @@ -2351,11 +2351,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - 
::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = int @@ -2366,11 +2366,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = rocprim::half, value_type = short @@ -2381,11 +2381,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = rocprim::half, value_type = int8_t @@ -2396,11 +2396,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int64_t @@ -2412,11 +2412,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = int @@ -2428,11 +2428,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int64_t, value_type = short @@ -2444,11 +2444,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<64, - 14, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int64_t, value_type = int8_t @@ -2459,11 +2459,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 8) && (sizeof(key_type) > 4) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int64_t @@ -2475,11 +2475,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<128, - 10, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 10, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int, value_type = int @@ -2491,11 +2491,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 12, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 12, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = short @@ -2507,11 +2507,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int, value_type = int8_t @@ -2522,11 +2522,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 4) && (sizeof(key_type) > 2) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<128, - 14, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 14, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = int64_t @@ -2538,11 +2538,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int @@ -2554,11 +2554,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<256, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = short, value_type = short @@ -2570,11 +2570,11 @@ struct default_scan_by_key_config< std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = short, value_type = int8_t @@ -2585,11 +2585,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 2) && (sizeof(key_type) > 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - 
::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int64_t @@ -2600,11 +2600,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> - : scan_by_key_config_v2<64, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<64, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = int @@ -2615,11 +2615,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> - : scan_by_key_config_v2<128, - 18, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::using_warp_scan> + : scan_by_key_config<128, + 18, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::using_warp_scan> {}; // Based on key_type = int8_t, value_type = short @@ -2630,11 +2630,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> - : scan_by_key_config_v2<256, - 20, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - 
block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 20, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; // Based on key_type = int8_t, value_type = int8_t @@ -2645,11 +2645,11 @@ struct default_scan_by_key_config< value_type, std::enable_if_t<(!bool(rocprim::is_floating_point::value) && (sizeof(key_type) <= 1) && (sizeof(value_type) <= 1))>> - : scan_by_key_config_v2<256, - 24, - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - block_scan_algorithm::reduce_then_scan> + : scan_by_key_config<256, + 24, + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + block_scan_algorithm::reduce_then_scan> {}; } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp index fca5bc979..4a7592b5d 100644 --- a/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp +++ b/rocprim/include/rocprim/device/detail/device_adjacent_difference.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -28,6 +28,7 @@ #include "../../detail/various.hpp" #include "../../config.hpp" +#include "device_config_helper.hpp" #include @@ -180,18 +181,25 @@ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void adjacent_difference_kernel_impl( using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; - static constexpr unsigned int block_size = Config::block_size; - static constexpr unsigned int items_per_thread = Config::items_per_thread; + static constexpr adjacent_difference_config_params params = device_params(); + + static constexpr unsigned int block_size = params.adjacent_difference_kernel_config.block_size; + static constexpr unsigned int items_per_thread + = params.adjacent_difference_kernel_config.items_per_thread; static constexpr unsigned int items_per_block = block_size * items_per_thread; using block_load_type - = ::rocprim::block_load; - using block_store_type - = ::rocprim::block_store; + = ::rocprim::block_load; + using block_store_type = ::rocprim:: + block_store; using adjacent_helper = adjacent_diff_helper; +#if defined(__gfx1102__) or defined(__gfx1030__) ROCPRIM_SHARED_MEMORY struct +#else + ROCPRIM_SHARED_MEMORY union +#endif { typename block_load_type::storage_type load; typename adjacent_helper::storage_type adjacent_diff; diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp index 999df33cc..532b8d8cd 100644 --- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp +++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp @@ -47,7 +47,6 @@ namespace detail struct merge_sort_block_sort_config_params { kernel_config_params block_sort_config = {0, 0}; - block_sort_algorithm block_sort_method = 
block_sort_algorithm::stable_merge_sort; }; // Necessary to construct a parameterized type of `merge_sort_block_sort_config_params`. @@ -57,7 +56,7 @@ struct merge_sort_block_sort_config : rocprim::detail::merge_sort_block_sort_con { using sort_config = kernel_config; constexpr merge_sort_block_sort_config() - : rocprim::detail::merge_sort_block_sort_config_params{sort_config(), Algo} {}; + : rocprim::detail::merge_sort_block_sort_config_params{sort_config()} {}; }; constexpr unsigned int merge_sort_items_per_thread(const unsigned int item_scale) @@ -206,6 +205,9 @@ struct radix_sort_onesweep_config : detail::radix_sort_onesweep_config_params namespace detail { +struct reduce_config_tag +{}; + // Calculate kernel configurations, such that it will not exceed shared memory maximum template struct radix_sort_onesweep_config_base @@ -240,6 +242,8 @@ template struct reduce_config : rocprim::detail::reduce_config_params { + /// \brief Identifies the algorithm associated to the config. + using tag = detail::reduce_config_tag; constexpr reduce_config() : rocprim::detail::reduce_config_params{ {BlockSize, ItemsPerThread, SizeLimit}, @@ -265,6 +269,9 @@ template struct default_reduce_config_base : default_reduce_config_base_helper::type {}; +struct scan_config_tag +{}; + /// \brief Provides the kernel parameters for exclusive_scan and inclusive_scan based /// on autotuned configurations or user-provided configurations. struct scan_config_params @@ -291,8 +298,10 @@ template -struct scan_config_v2 : ::rocprim::detail::scan_config_params +struct scan_config : ::rocprim::detail::scan_config_params { + /// \brief Identifies the algorithm associated to the config. + using tag = detail::scan_config_tag; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Requirement dictated by init_lookback_scan_state_kernel. 
static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE, @@ -311,54 +320,6 @@ struct scan_config_v2 : ::rocprim::detail::scan_config_params /// \brief Limit on the number of items for a single scan kernel launch. static constexpr unsigned int size_limit = SizeLimit; - constexpr scan_config_v2() - : ::rocprim::detail::scan_config_params{ - {BlockSize, ItemsPerThread, SizeLimit}, - BlockLoadMethod, - BlockStoreMethod, - BlockScanMethod - } {}; -#endif -}; - -/// \brief Deprecated: Configuration of device-level scan primitives. -/// -/// \tparam BlockSize - number of threads in a block. -/// \tparam ItemsPerThread - number of items processed by each thread. -/// \tparam UseLookback - deprecated, scan always uses lookback scan. -/// \tparam BlockLoadMethod - method for loading input values. -/// \tparam StoreLoadMethod - method for storing values. -/// \tparam BlockScanMethod - algorithm for block scan. -/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch. -template -struct -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition -[[deprecated("The UseLookback switch has been removed, as scan now only supports the " - "lookback-scan implementation. Use scan_config_v2 instead.")]] -#endif -scan_config : ::rocprim::detail::scan_config_params -{ - /// \brief Number of threads in a block. - static constexpr unsigned int block_size = BlockSize; - /// \brief Number of items processed by each thread. - static constexpr unsigned int items_per_thread = ItemsPerThread; - /// \brief Whether to use lookback scan or reduce-then-scan algorithm. - static constexpr bool use_lookback = UseLookback; - /// \brief Method for loading input values. - static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod; - /// \brief Method for storing values. - static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod; - /// \brief Algorithm for block scan. 
- static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod; - /// \brief Limit on the number of items for a single scan kernel launch. - static constexpr unsigned int size_limit = SizeLimit; - constexpr scan_config() : ::rocprim::detail::scan_config_params{ {BlockSize, ItemsPerThread, SizeLimit}, @@ -366,22 +327,26 @@ scan_config : ::rocprim::detail::scan_config_params BlockStoreMethod, BlockScanMethod } {}; +#endif }; namespace detail { +struct scan_by_key_config_tag +{}; + template struct default_scan_config_base_helper { static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); - using type = scan_config_v2::value, - ::rocprim::max(1u, 16u / item_scale), - ::rocprim::block_load_method::block_load_transpose, - ::rocprim::block_store_method::block_store_transpose, - ::rocprim::block_scan_algorithm::using_warp_scan>; + using type = scan_config::value, + ::rocprim::max(1u, 16u / item_scale), + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose, + ::rocprim::block_scan_algorithm::using_warp_scan>; }; template @@ -414,8 +379,10 @@ template -struct scan_by_key_config_v2 : ::rocprim::detail::scan_by_key_config_params +struct scan_by_key_config : ::rocprim::detail::scan_by_key_config_params { + /// \brief Identifies the algorithm associated to the config. + using tag = detail::scan_by_key_config_tag; #ifndef DOXYGEN_SHOULD_SKIP_THIS // Requirement dictated by init_lookback_scan_state_kernel. static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE, @@ -434,55 +401,6 @@ struct scan_by_key_config_v2 : ::rocprim::detail::scan_by_key_config_params /// \brief Limit on the number of items for a single scan kernel launch. 
static constexpr unsigned int size_limit = SizeLimit; - constexpr scan_by_key_config_v2() - : ::rocprim::detail::scan_by_key_config_params{ - {BlockSize, ItemsPerThread, SizeLimit}, - BlockLoadMethod, - BlockStoreMethod, - BlockScanMethod - } {}; -#endif -}; - -/// \brief Deprecated: Configuration of device-level scan-by-key operation. -/// -/// \tparam BlockSize - number of threads in a block. -/// \tparam ItemsPerThread - number of items processed by each thread. -/// \tparam UseLookback - deprecated, scan always uses lookback scan. -/// \tparam BlockLoadMethod - method for loading input values. -/// \tparam StoreLoadMethod - method for storing values. -/// \tparam BlockScanMethod - algorithm for block scan. -/// \tparam SizeLimit - limit on the number of items for a single scan kernel launch. -template -struct -#ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition -[[deprecated( - "The UseLookback switch has been removed, as scan now only supports the lookback-scan " - "implementation. Use scan_by_key_config_v2 instead.")]] -#endif -scan_by_key_config : ::rocprim::detail::scan_by_key_config_params -{ - /// \brief Number of threads in a block. - static constexpr unsigned int block_size = BlockSize; - /// \brief Number of items processed by each thread. - static constexpr unsigned int items_per_thread = ItemsPerThread; - /// \brief Whether to use lookback scan or reduce-then-scan algorithm. - static constexpr bool use_lookback = UseLookback; - /// \brief Method for loading input values. - static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod; - /// \brief Method for storing values. - static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod; - /// \brief Algorithm for block scan. - static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod; - /// \brief Limit on the number of items for a single scan kernel launch. 
- static constexpr unsigned int size_limit = SizeLimit; - constexpr scan_by_key_config() : ::rocprim::detail::scan_by_key_config_params{ {BlockSize, ItemsPerThread, SizeLimit}, @@ -490,6 +408,7 @@ scan_by_key_config : ::rocprim::detail::scan_by_key_config_params BlockStoreMethod, BlockScanMethod } {}; +#endif }; namespace detail @@ -501,7 +420,7 @@ struct default_scan_by_key_config_base_helper static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div( sizeof(Key) + sizeof(Value), 2 * sizeof(int)); - using type = scan_by_key_config_v2< + using type = scan_by_key_config< limit_block_size<256U, sizeof(Key) + sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, ::rocprim::max(1u, 16u / item_scale), ::rocprim::block_load_method::block_load_transpose, @@ -513,6 +432,9 @@ template struct default_scan_by_key_config_base : default_scan_by_key_config_base_helper::type {}; +struct transform_config_tag +{}; + struct transform_config_params { kernel_config_params kernel_config{}; @@ -527,8 +449,10 @@ struct transform_config_params template -struct transform_config +struct transform_config : public detail::transform_config_params { + /// \brief Identifies the algorithm associated to the config. + using tag = detail::transform_config_tag; #ifndef DOXYGEN_SHOULD_SKIP_THIS /// \brief Number of threads in a block. @@ -540,6 +464,11 @@ struct transform_config /// \brief Limit on the number of items for a single kernel launch. 
static constexpr unsigned int size_limit = SizeLimit; + constexpr transform_config() + : detail::transform_config_params{ + {BlockSize, ItemsPerThread, SizeLimit} + } + {} #endif }; @@ -559,6 +488,13 @@ template struct default_transform_config_base : default_transform_config_base_helper::type {}; +struct binary_search_config_tag : public transform_config_tag +{}; +struct upper_bound_config_tag : public transform_config_tag +{}; +struct lower_bound_config_tag : public transform_config_tag +{}; + } // namespace detail /// \brief Configuration for the device-level binary search operation. @@ -569,7 +505,10 @@ template struct binary_search_config : transform_config -{}; +{ + /// \brief Identifies the algorithm associated to the config. + using tag = detail::binary_search_config_tag; +}; /// \brief Configuration for the device-level upper bound operation. /// \tparam BlockSize Number of threads in a block. @@ -579,7 +518,10 @@ template struct upper_bound_config : transform_config -{}; +{ + /// \brief Identifies the algorithm associated to the config. + using tag = detail::upper_bound_config_tag; +}; /// \brief Configuration for the device-level lower bound operation. /// \tparam BlockSize Number of threads in a block. @@ -589,11 +531,17 @@ template struct lower_bound_config : transform_config -{}; +{ + /// \brief Identifies the algorithm associated to the config. + using tag = detail::lower_bound_config_tag; +}; namespace detail { +struct histogram_config_tag +{}; + template struct default_binary_search_config_base : binary_search_config< @@ -630,6 +578,8 @@ template struct histogram_config : detail::histogram_config_params { + /// \brief Identifies the algorithm associated to the config. 
+ using tag = detail::histogram_config_tag; #ifndef DOXYGEN_SHOULD_SKIP_THIS using histogram = HistogramConfig; @@ -661,6 +611,69 @@ struct default_histogram_config_base : default_histogram_config_base_helper::type {}; +struct adjacent_difference_config_tag +{}; + +struct adjacent_difference_config_params +{ + kernel_config_params adjacent_difference_kernel_config; + ::rocprim::block_load_method block_load_method; + ::rocprim::block_store_method block_store_method; +}; +} // namespace detail + +/// \brief Configuration of device-level adjacent difference primitives. +/// +/// \tparam BlockSize - number of threads in a block. +/// \tparam ItemsPerThread - number of items processed by each thread. +/// \tparam BlockLoadMethod - method for loading input values. +/// \tparam BlockStoreMethod - method for storing values. +/// \tparam SizeLimit - limit on the number of items for a single adjacent difference kernel launch. +template +struct adjacent_difference_config : public detail::adjacent_difference_config_params +{ + /// \brief Identifies the algorithm associated to the config. 
+ using tag = detail::adjacent_difference_config_tag; +#ifndef DOXYGEN_SHOULD_SKIP_THIS + static constexpr ::rocprim::block_load_method block_load_method = BlockLoadMethod; + static constexpr ::rocprim::block_store_method block_store_method = BlockStoreMethod; + static constexpr unsigned int block_size = BlockSize; + static constexpr unsigned int items_per_thread = ItemsPerThread; + static constexpr unsigned int size_limit = SizeLimit; + + constexpr adjacent_difference_config() + : detail::adjacent_difference_config_params{ + {BlockSize, ItemsPerThread, SizeLimit}, + BlockLoadMethod, BlockStoreMethod + } {}; +#endif +}; + +namespace detail +{ + +template +struct default_adjacent_difference_config_base_helper +{ + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = adjacent_difference_config< + limit_block_size<256U, sizeof(Value), ROCPRIM_WARP_SIZE_64>::value, + ::rocprim::max(1u, 16u / item_scale), + ::rocprim::block_load_method::block_load_transpose, + ::rocprim::block_store_method::block_store_transpose>; +}; + +template +struct default_adjacent_difference_config_base + : default_adjacent_difference_config_base_helper::type +{}; + } // namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/device/detail/device_histogram.hpp b/rocprim/include/rocprim/device/detail/device_histogram.hpp index 27453b3ff..6798e2be9 100644 --- a/rocprim/include/rocprim/device/detail/device_histogram.hpp +++ b/rocprim/include/rocprim/device/detail/device_histogram.hpp @@ -502,14 +502,12 @@ ROCPRIM_DEVICE ROCPRIM_INLINE void const lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set); same_bin_lanes_mask &= (bit_set ? 
bit_set_mask : ~bit_set_mask); } - const unsigned int same_bin_count = ::rocprim::bit_count(same_bin_lanes_mask); - const unsigned int prev_same_bin_count - = ::rocprim::masked_bit_count(same_bin_lanes_mask); - if(prev_same_bin_count == 0) + if(::rocprim::group_elect(same_bin_lanes_mask)) { // Write the number of lanes having this bin, // if the current lane is the first (and maybe only) lane with this bin. - ::rocprim::detail::atomic_add(&histogram[channel][bin], same_bin_count); + ::rocprim::detail::atomic_add(&histogram[channel][bin], + ::rocprim::bit_count(same_bin_lanes_mask)); } } } diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp index b28d9c2c2..eeefcaee9 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort.hpp @@ -318,145 +318,13 @@ struct block_permute_values_impl -struct block_sort_impl -{ - using stable_key_type = rocprim::tuple; - - using keys_load_type - = block_load; - - using sort_type - = block_sort; - - using keys_store_type - = block_store; - - using values_permute_type = block_permute_values_impl; - - union storage_type - { - typename keys_load_type::storage_type load_keys; - typename sort_type::storage_type sort; - typename keys_store_type::storage_type store_keys; - typename values_permute_type::storage_type permute_values; - }; - - template - ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE - void sort(const unsigned int valid_in_last_block, - const bool is_incomplete_block, - KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - BinaryFunction compare_function, - storage_type& storage) - { - // By default, the block sort algorithm is not stable. We can make it stable - // by adding an index to each key. 
- - Key keys[ItemsPerThread]; - - if(is_incomplete_block) - { - keys_load_type().load(keys_input, keys, valid_in_last_block, storage.load_keys); - } - else - { - keys_load_type().load(keys_input, keys, storage.load_keys); - } - - const auto flat_id = block_thread_id<0>(); - - stable_key_type stable_keys[ItemsPerThread]; - ROCPRIM_UNROLL - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - stable_keys[i] = rocprim::make_tuple(keys[i], flat_id * ItemsPerThread + i); - } - - syncthreads(); - - // Special compare function that enforces sorting is stable. - auto stable_compare_function - = [compare_function](const stable_key_type& a, - const stable_key_type& b) ROCPRIM_FORCE_INLINE mutable - { - const bool ab = compare_function(rocprim::get<0>(a), rocprim::get<0>(b)); - return ab - || (!compare_function(rocprim::get<0>(b), rocprim::get<0>(a)) - && (rocprim::get<1>(a) < rocprim::get<1>(b))); - }; - - if(is_incomplete_block) - { - // Special compare function that enforces sorting is stable, and that out-of-bounds elements - // are not compared. - auto stable_oob_compare_function - = [stable_compare_function, valid_in_last_block](const stable_key_type& a, - const stable_key_type& b) mutable - { - const bool a_oob = rocprim::get<1>(a) >= valid_in_last_block; - const bool b_oob = rocprim::get<1>(b) >= valid_in_last_block; - return a_oob || b_oob ? !a_oob : stable_compare_function(a, b); - }; - - // Note: rocprim::block_sort with an algorithm that is not stable_merge_sort does not implement sorting - // a misaligned amount of items. 
- sort_type().sort(stable_keys, storage.sort, stable_oob_compare_function); - - unsigned int ranks[ItemsPerThread]; - ROCPRIM_UNROLL - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - keys[i] = rocprim::get<0>(stable_keys[i]); - ranks[i] = rocprim::get<1>(stable_keys[i]); - } - - syncthreads(); - keys_store_type().store(keys_output, keys, valid_in_last_block, storage.store_keys); - values_permute_type().permute(ranks, - values_input, - values_output, - valid_in_last_block, - storage.permute_values); - } - else - { - sort_type().sort(stable_keys, storage.sort, stable_compare_function); - - unsigned int ranks[ItemsPerThread]; - ROCPRIM_UNROLL - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - keys[i] = rocprim::get<0>(stable_keys[i]); - ranks[i] = rocprim::get<1>(stable_keys[i]); - } - - syncthreads(); - keys_store_type().store(keys_output, keys, storage.store_keys); - values_permute_type().permute(ranks, - values_input, - values_output, - storage.permute_values); - } - } -}; +struct block_sort_impl; template -struct block_sort_impl +struct block_sort_impl { using keys_load_type = block_load; @@ -518,7 +386,6 @@ struct block_sort_impl> { using keys_load_type @@ -599,7 +466,6 @@ struct block_sort_impl sizeof(int))>> { using keys_load_type @@ -677,9 +543,8 @@ struct block_sort_impl; + using sort_impl = block_sort_impl; ROCPRIM_SHARED_MEMORY typename sort_impl::storage_type storage; diff --git a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp index 2d8800876..bbcff597b 100644 --- a/rocprim/include/rocprim/device/detail/device_radix_sort.hpp +++ b/rocprim/include/rocprim/device/detail/device_radix_sort.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -163,20 +163,16 @@ struct radix_digit_count_helper const bit_key_type bit_key = key_codec::encode(keys[i]); const unsigned int digit = key_codec::extract_digit(bit_key, bit, current_radix_bits); const unsigned int pos = i * BlockSize + flat_id; - lane_mask_type same_digit_lanes_mask = ::rocprim::ballot(IsFull || (pos < valid_count)); - for(unsigned int b = 0; b < RadixBits; b++) - { - const unsigned int bit_set = digit & (1u << b); - const lane_mask_type bit_set_mask = ::rocprim::ballot(bit_set); - same_digit_lanes_mask &= (bit_set ? bit_set_mask : ~bit_set_mask); - } - const unsigned int same_digit_count = ::rocprim::bit_count(same_digit_lanes_mask); - const unsigned int prev_same_digit_count = ::rocprim::masked_bit_count(same_digit_lanes_mask); - if(prev_same_digit_count == 0) + + lane_mask_type same_digit_lanes_mask + = ::rocprim::match_any(digit, IsFull || (pos < valid_count)); + + if(::rocprim::group_elect(same_digit_lanes_mask)) { // Write the number of lanes having this digit, // if the current lane is the first (and maybe only) lane with this digit. 
- storage.digit_counts[warp_id][digit] += same_digit_count; + storage.digit_counts[warp_id][digit] + += ::rocprim::bit_count(same_digit_lanes_mask); } } } @@ -1194,17 +1190,18 @@ template -ROCPRIM_DEVICE void onesweep_iteration(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const unsigned int size, - Offset* global_digit_offsets_in, - Offset* global_digit_offsets_out, - onesweep_lookback_state* lookback_states, - const unsigned int bit, - const unsigned int current_radix_bits, - const unsigned int full_blocks) +ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + onesweep_iteration(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const unsigned int size, + Offset* global_digit_offsets_in, + Offset* global_digit_offsets_out, + onesweep_lookback_state* lookback_states, + const unsigned int bit, + const unsigned int current_radix_bits, + const unsigned int full_blocks) { using key_type = typename std::iterator_traits::value_type; using value_type = typename std::iterator_traits::value_type; diff --git a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp index 3fd9f8a65..16bc5b866 100644 --- a/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/detail/device_reduce_by_key.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -31,6 +31,7 @@ #include "../../detail/match_result_type.hpp" #include "../../detail/various.hpp" #include "../../intrinsics/thread.hpp" +#include "../../thread/thread_operators.hpp" #include "../../config.hpp" @@ -59,6 +60,36 @@ template using lookback_scan_state_t = detail::lookback_scan_state, UseSleep>; +template +struct guarded_inequality_wrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Out-of-bounds limit + size_t guard; + + /// Constructor + ROCPRIM_HOST_DEVICE inline guarded_inequality_wrapper(EqualityOp op, size_t guard) + : op(op), guard(guard) + {} + + /// \brief Guarded boolean inequality operator. + /// + /// \tparam T Type of the operands compared by the equality operator + /// \param a Left hand-side operand + /// \param b Right hand-side operand + /// \param idx Index of the thread calling to this operator. This is used to determine which + /// operations are out-of-bounds + /// \returns !op(a, b) for a certain equality operator \p op when in-bounds. + template + ROCPRIM_HOST_DEVICE inline bool operator()(const T& a, const T& b, size_t idx) const + { + // In-bounds return operation result, out-of-bounds return false. + return (idx < guard) ? 
!op(a, b) : 0; + } +}; + template(compare, remaining); + + if(!is_global_first_tile) + { + const KeyType tile_predecessor = tile_keys[-1]; + block_discontinuity_type{}.flag_heads(head_flags, + tile_predecessor, + keys, + guarded_not_equal, + storage); + } + else + { + block_discontinuity_type{}.flag_heads(head_flags, keys, guarded_not_equal, storage); + } } else { - block_discontinuity_type{}.flag_heads(head_flags, keys, not_equal, storage); + auto not_equal = rocprim::inequality_wrapper(compare); + + if(!is_global_first_tile) + { + const KeyType tile_predecessor = tile_keys[-1]; + block_discontinuity_type{}.flag_heads(head_flags, + tile_predecessor, + keys, + not_equal, + storage); + } + else + { + block_discontinuity_type{}.flag_heads(head_flags, keys, not_equal, storage); + } } } }; @@ -270,8 +320,11 @@ class tile_helper // first tile in this launch const bool is_first_tile = tile_id == 0; + // When in last tile valid_in_global_last_tile = remaining const unsigned int valid_in_global_last_tile = static_cast(size - ((total_number_of_tiles - 1) * items_per_tile)); + const size_t remaining + = static_cast(size - (size_t{global_tile_id} * items_per_tile)); const unsigned int flat_thread_id = threadIdx.x; @@ -293,6 +346,8 @@ class tile_helper compare, head_flags, is_global_first_tile, + is_global_last_tile, + remaining, storage.scan.flags); wrapped_type wrapped_values[ItemsPerThread]; diff --git a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp index f01f7347a..9287e5b21 100644 --- a/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp +++ b/rocprim/include/rocprim/device/detail/lookback_scan_state.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -35,6 +35,8 @@ #include "../../detail/temp_storage.hpp" #include "../../detail/various.hpp" +#include "../config_types.hpp" + extern "C" { void __builtin_amdgcn_s_sleep(int); @@ -98,26 +100,37 @@ struct lookback_scan_state using value_type = T; // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes - ROCPRIM_HOST static inline - lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t create(lookback_scan_state& state, + void* temp_storage, + const unsigned int number_of_blocks, + const hipStream_t /*stream*/) { - (void) number_of_blocks; - lookback_scan_state state; + (void)number_of_blocks; state.prefixes = reinterpret_cast(temp_storage); - return state; + return hipSuccess; } - ROCPRIM_HOST static inline - size_t get_storage_size(const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks, + const hipStream_t stream, + size_t& storage_size) { - return sizeof(prefix_underlying_type) * (::rocprim::host_warp_size() + number_of_blocks); + unsigned int warp_size; + hipError_t error = ::rocprim::host_warp_size(stream, warp_size); + + storage_size = sizeof(prefix_underlying_type) * (warp_size + number_of_blocks); + + return error; } - ROCPRIM_HOST static inline detail::temp_storage::layout - get_temp_storage_layout(const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t + get_temp_storage_layout(const unsigned int number_of_blocks, + const hipStream_t stream, + detail::temp_storage::layout& layout) { - return detail::temp_storage::layout{get_storage_size(number_of_blocks), - alignof(prefix_underlying_type)}; + size_t storage_size = 0; + hipError_t error = get_storage_size(number_of_blocks, stream, storage_size); + layout 
= detail::temp_storage::layout{storage_size, alignof(prefix_underlying_type)}; + return error; } ROCPRIM_DEVICE ROCPRIM_INLINE @@ -238,11 +251,15 @@ struct lookback_scan_state using value_type = T; // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes - ROCPRIM_HOST static inline - lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t create(lookback_scan_state& state, + void* temp_storage, + const unsigned int number_of_blocks, + const hipStream_t stream) { - const auto n = ::rocprim::host_warp_size() + number_of_blocks; - lookback_scan_state state; + unsigned int warp_size; + hipError_t error = ::rocprim::host_warp_size(stream, warp_size); + + const auto n = warp_size + number_of_blocks; auto ptr = static_cast(temp_storage); @@ -253,23 +270,31 @@ struct lookback_scan_state ptr += ::rocprim::detail::align_size(n * sizeof(T)); state.prefixes_complete_values = reinterpret_cast(ptr); - return state; + return error; } - ROCPRIM_HOST static inline - size_t get_storage_size(const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t get_storage_size(const unsigned int number_of_blocks, + const hipStream_t stream, + size_t& storage_size) { - const auto n = ::rocprim::host_warp_size() + number_of_blocks; - size_t size = ::rocprim::detail::align_size(n * sizeof(flag_type)); - size += 2 * ::rocprim::detail::align_size(n * sizeof(T)); - return size; + unsigned int warp_size; + hipError_t error = ::rocprim::host_warp_size(stream, warp_size); + const auto n = warp_size + number_of_blocks; + storage_size = ::rocprim::detail::align_size(n * sizeof(flag_type)); + storage_size += 2 * ::rocprim::detail::align_size(n * sizeof(T)); + return error; } - ROCPRIM_HOST static inline detail::temp_storage::layout - get_temp_storage_layout(const unsigned int number_of_blocks) + ROCPRIM_HOST static inline hipError_t + get_temp_storage_layout(const unsigned int 
number_of_blocks, + const hipStream_t stream, + detail::temp_storage::layout& layout) { + size_t storage_size = 0; size_t alignment = std::max(alignof(flag_type), alignof(T)); - return detail::temp_storage::layout{get_storage_size(number_of_blocks), alignment}; + hipError_t error = get_storage_size(number_of_blocks, stream, storage_size); + layout = detail::temp_storage::layout{storage_size, alignment}; + return error; } ROCPRIM_DEVICE ROCPRIM_INLINE diff --git a/rocprim/include/rocprim/device/device_adjacent_difference.hpp b/rocprim/include/rocprim/device/device_adjacent_difference.hpp index 09c418504..917bf4328 100644 --- a/rocprim/include/rocprim/device/device_adjacent_difference.hpp +++ b/rocprim/include/rocprim/device/device_adjacent_difference.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -71,22 +71,28 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -void ROCPRIM_KERNEL __launch_bounds__(Config::block_size) adjacent_difference_kernel( - const InputIt input, - const OutputIt output, - const std::size_t size, - const BinaryFunction op, - const typename std::iterator_traits::value_type* previous_values, - const std::size_t starting_block) +template +void ROCPRIM_KERNEL + __launch_bounds__(device_params().adjacent_difference_kernel_config.block_size) + adjacent_difference_kernel( + const InputIt input, + const OutputIt output, + const std::size_t size, + const BinaryFunction op, + const typename std::iterator_traits::value_type* previous_values, + const std::size_t starting_block) { - adjacent_difference_kernel_impl( - input, output, size, op, previous_values, starting_block); + adjacent_difference_kernel_impl(input, + output, + size, + op, + previous_values, + 
starting_block); } template ::value_type; - using config = detail::default_or_custom_config< - Config, - detail::default_adjacent_difference_config>; + using config = wrapped_adjacent_difference_config; + + detail::target_arch target_arch; + hipError_t result = detail::host_target_arch(stream, target_arch); + if(result != hipSuccess) + { + return result; + } - static constexpr unsigned int block_size = config::block_size; - static constexpr unsigned int items_per_thread = config::items_per_thread; - static constexpr unsigned int items_per_block = block_size * items_per_thread; + const detail::adjacent_difference_config_params params + = detail::dispatch_target_arch(target_arch); - const std::size_t num_blocks = ceiling_div(size, items_per_block); - const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0; + const unsigned int block_size = params.adjacent_difference_kernel_config.block_size; + const unsigned int items_per_thread = params.adjacent_difference_kernel_config.items_per_thread; + const unsigned int items_per_block = block_size * items_per_thread; + const std::size_t num_blocks = ceiling_div(size, items_per_block); + const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0; value_type* previous_values; @@ -139,11 +152,11 @@ hipError_t adjacent_difference_impl(void* const temporary_storage, { // If doing left adjacent diff then the last item of each block is needed for the // next block, otherwise the first item is needed for the previous block - static constexpr auto offset = items_per_block - (Right ? 0 : 1); + const auto offset = items_per_block - (Right ? 
0 : 1); const auto block_starts_iter = make_transform_iterator( - rocprim::make_counting_iterator(std::size_t {0}), - [base = input + offset](std::size_t i) { return base[i * items_per_block]; }); + rocprim::make_counting_iterator(std::size_t{0}), + [=, base = input + offset](std::size_t i) { return base[i * items_per_block]; }); const hipError_t error = ::rocprim::transform(block_starts_iter, previous_values, @@ -157,9 +170,9 @@ hipError_t adjacent_difference_impl(void* const temporary_storage, } } - static constexpr unsigned int size_limit = config::size_limit; - static constexpr auto number_of_blocks_limit = std::max(size_limit / items_per_block, 1u); - static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block; + const unsigned int size_limit = params.adjacent_difference_kernel_config.size_limit; + const auto number_of_blocks_limit = std::max(size_limit / items_per_block, 1u); + const auto aligned_size_limit = number_of_blocks_limit * items_per_block; // Launch number_of_blocks_limit blocks while there is still at least as many blocks // left as the limit @@ -210,7 +223,7 @@ hipError_t adjacent_difference_impl(void* const temporary_storage, } } // namespace detail -#undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR + #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR #endif // DOXYGEN_SHOULD_SKIP_THIS @@ -231,8 +244,8 @@ hipError_t adjacent_difference_impl(void* const temporary_storage, /// } /// \endcode /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// `adjacent_difference_config` or a class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// `adjacent_difference_config` or a class derived from it. /// \tparam InputIt - [inferred] random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. 
/// \tparam OutputIt - [inferred] random-access iterator type of the output range. Must meet the @@ -327,8 +340,8 @@ hipError_t adjacent_difference(void* const temporary_storage, /// } /// \endcode /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// `adjacent_difference_config` or a class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// `adjacent_difference_config` or a class derived from it. /// \tparam InputIt - [inferred] random-access iterator type of the value range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam BinaryFunction - [inferred] binary operation function object that will be applied to @@ -380,8 +393,8 @@ hipError_t adjacent_difference_inplace(void* const temporary_storage, /// } /// \endcode /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// `adjacent_difference_config` or a class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// `adjacent_difference_config` or a class derived from it. /// \tparam InputIt - [inferred] random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIt - [inferred] random-access iterator type of the output range. Must meet the @@ -476,8 +489,8 @@ hipError_t adjacent_difference_right(void* const temporary_storage, /// } /// \endcode /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// `adjacent_difference_config` or a class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// `adjacent_difference_config` or a class derived from it. /// \tparam InputIt - [inferred] random-access iterator type of the value range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. 
/// \tparam BinaryFunction - [inferred] binary operation function object that will be applied to diff --git a/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp b/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp index 804e7d20b..0299484f1 100644 --- a/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp +++ b/rocprim/include/rocprim/device/device_adjacent_difference_config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -28,6 +28,9 @@ #include "../functional.hpp" #include "config_types.hpp" +#include "detail/config/device_adjacent_difference.hpp" +#include "detail/config/device_adjacent_difference_inplace.hpp" +#include "detail/device_config_helper.hpp" #include "../block/block_load.hpp" #include "../block/block_store.hpp" @@ -37,44 +40,62 @@ BEGIN_ROCPRIM_NAMESPACE -/// \brief Configuration of device-level adjacent_difference primitives. -/// -/// \tparam BlockSize - number of threads in a block. -/// \tparam ItemsPerThread - number of items processed by each thread -/// \tparam LoadMethod - method for loading input values -/// \tparam StoreMethod - method for storing values -/// \tparam SizeLimit - limit on the number of items for a single adjacent_difference kernel launch. -/// Larger input sizes will be broken up to multiple kernel launches. 
-template -struct adjacent_difference_config : kernel_config -{ - static constexpr block_load_method load_method = LoadMethod; ///< input values are loaded using this method - static constexpr block_store_method store_method = StoreMethod; ///< input values are stored using this method -}; - namespace detail { -template -struct adjacent_difference_config_fallback +// Specialization for user provided configuration +template +struct wrapped_adjacent_difference_config { - static constexpr unsigned int item_scale - = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + static_assert( + std::is_same::value, + "Config must be a specialization of struct template adjacent_difference_config"); + + template + struct architecture_config + { + static constexpr adjacent_difference_config_params params = AdjacentDifferenceConfig{}; + }; +}; - using type = adjacent_difference_config<256, ::rocprim::max(1u, 16u / item_scale)>; +// Specialization for selecting the default configuration for in place +template +struct wrapped_adjacent_difference_config +{ + template + struct architecture_config + { + static constexpr adjacent_difference_config_params params + = default_adjacent_difference_inplace_config(Arch), Value>{}; + }; }; -template -struct default_adjacent_difference_config - : select_arch> +// Specialization for selecting the default configuration for out of place +template +struct wrapped_adjacent_difference_config { + template + struct architecture_config + { + static constexpr adjacent_difference_config_params params + = default_adjacent_difference_config(Arch), Value>{}; + }; }; -} // end namespace detail +#ifndef DOXYGEN_SHOULD_SKIP_THIS +template +template +constexpr adjacent_difference_config_params + wrapped_adjacent_difference_config::architecture_config< + Arch>::params; +template +template +constexpr adjacent_difference_config_params + wrapped_adjacent_difference_config::architecture_config< + Arch>::params; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +} // 
namespace detail END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/device/device_binary_search.hpp b/rocprim/include/rocprim/device/device_binary_search.hpp index 2f6a0146d..395f5ab19 100644 --- a/rocprim/include/rocprim/device/device_binary_search.hpp +++ b/rocprim/include/rocprim/device/device_binary_search.hpp @@ -31,11 +31,11 @@ #include "device_binary_search_config.hpp" #include "device_transform.hpp" -BEGIN_ROCPRIM_NAMESPACE - /// \addtogroup devicemodule /// @{ +BEGIN_ROCPRIM_NAMESPACE + namespace detail { @@ -83,45 +83,117 @@ hipError_t binary_search(void * temporary_storage, ); } +template +struct is_default_or_has_tag +{ + static constexpr bool value + = std::integral_constant::value>::value; +}; + +template +struct is_default_or_has_tag +{ + static constexpr bool value = true; +}; + } // end of detail namespace -/// \brief Performs a device-level lower bound check. +/// \brief Parallel primitive that uses binary search for computing a lower bound on a given ordered +/// range for each element of a given input. +/// +/// The `lower_bound` function determines for each element `e` of a given input the greatest index +/// `i` in a given ordered range `haystack` such that `!compare_op(e, haystack[i])` is +/// `true.` +/// It uses the search function `detail::lower_bound_search_op,` which in turn uses a binary +/// operator `compare_op` for comparing the given value with the haystack ones. /// /// \par Overview -/// Runs multiple lower bound checks in parallel (one for each \p needle in needles). -/// A lower bound check returns the index of the first element in \p haystack that -/// causes \p compare_op(element,needle) to return false. If no item in \p haystack satisfies -/// this criteria, then \p haystack_size is returned. -/// Results are written by \p output. 
+/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes) +/// is written to `storage_size` and the function returns without performing the search operation. +/// * If used along with `rocprim::upper_bound`, the ith element of the given input must be located +/// in the semi-open interval `[lower_output[i], upper_output[i])` of `haystack`, in case of +/// being present at all. /// -/// \tparam Config - [optional] configuration information for the primitive. This can be -/// \p lower_bound_config or a custom class with the same members. -/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values). -/// \tparam NeedlesIterator - Iterator type for items we are performing lower bound checks -/// for (keys). -/// \tparam OutputIterator - Iterator type for the output indices. -/// \tparam CompareFunction [optional] A callable that can be used to compare two values. -/// defaults to rocprim::less. +/// \tparam Config - [optional] Configuration of the primitive. It has to be `lower_bound_config` or +/// a class derived from it. Default is `default_config`. +/// \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. Elements of +/// the type pointed by it must be comparable to elements of the type pointed by HaystackIterator +/// as either operand of `compare_op`. +/// \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet +/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type. 
+/// \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the +/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible +/// to bool. Default type is `::rocprim::less<>`. +/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage. +/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`. +/// \param [in] haystack - Iterator to the first element in the search range. Elements of this +/// range must be sorted. +/// \param [in] needles - Iterator to the first element in the range of values to search for on +/// `haystack`. +/// \param [out] output - Iterator to the first element in the output range. +/// \param [in] haystack_size - Number of elements in the search range `haystack`. +/// \param [in] needles_size - Number of elements in the input range `needles`. +/// \param [in] compare_op - Binary operation function object that is used to compare values. The +/// signature of the function should be equivalent to the following: +/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the +/// function object must not modify the objects passed to it. Default is `CompareFunction()`. +/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream). +/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is +/// forced in order to check for errors. +/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of +/// type `hipError_t`. /// -/// \param [in] temporary_storage - pointer to device-accessible temporary storage. When -/// a null pointer is passed, the required allocation size (in bytes) is written to -/// \p storage_size and the function returns without performing the search operation. -/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage. 
-/// \param haystack [in] - iterator pointing to the beginning of the range to search through. -/// \param needles [in] - iterator pointing to the first of the elements to perform lower -/// bound checks on. -/// \param output [out] - Iterator pointing to the beginning of the range where the results -/// are to be stored. -/// \param haystack_size [in] - the total number of values to search through. -/// \param needles_size [in] - the total number of keys to perform lower bound checks for. -/// \param compare_op [in] - binary operation function that will be used for comparison. -/// The signature of the function should be equivalent to the following: -/// bool f(const T &a, const T &b);. The signature does not need to have -/// const &, but the function object must not modify the objects passed to it. -/// The default value is \p CompareFunction(). -/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream). -/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel -/// launch is forced in order to check for errors. Default value is \p false. +/// \par Example +/// \parblock +/// In this example a device-level lower bound computation on a haystack of double precision type +/// values is performed on an input array of integer values. +/// +/// \code{.cpp} +/// #include <rocprim/rocprim.hpp> +/// +/// // Prepare input and output (declare pointers, allocate device memory etc.). +/// size_t haystack_size; // e.g. 7 +/// double * haystack; // e.g. {0, 1.5, 3, 4.5, 6, 7.5, 9} +/// size_t needles_size; // e.g. 5 +/// int * needles; // e.g. {1, 2, 3, 4, 5} +/// compare_op_type compare_op; // e.g. compare_op_type = rocprim::less<> +/// size_t * output; // empty array of needles_size elements +/// +/// // Get required size of the temporary storage. 
+/// void * temporary_storage = nullptr; +/// size_t temporary_storage_bytes; +/// rocprim::lower_bound(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // Allocate temporary storage. +/// hipMalloc(&temporary_storage, temporary_storage_bytes); +/// +/// // Perform binary search. +/// rocprim::lower_bound(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // output = {0, 1, 2, 2, 3} +/// \endcode +/// \endparblock template< class Config = default_config, class HaystackIterator, @@ -141,6 +213,9 @@ hipError_t lower_bound(void * temporary_storage, hipStream_t stream = 0, bool debug_synchronous = false) { + static_assert(detail::is_default_or_has_tag::value, + "Config must be a specialization of struct template lower_bound_config"); + using value_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using config @@ -161,43 +236,102 @@ hipError_t lower_bound(void * temporary_storage, debug_synchronous); } -/// \brief Performs a device-level upper bound check. +/// \brief Parallel primitive that uses binary search for computing an upper bound on a given ordered +/// range for each element of a given input. +/// +/// The `upper_bound` function determines for each element `e` of a given input the lowest index +/// `i` in a given ordered range `haystack` such that `compare_op(e, haystack[i])` is +/// `true.` +/// It uses the search function `detail::upper_bound_search_op,` which in turn uses a binary +/// operator `compare_op` for comparing the input values with the haystack ones. /// /// \par Overview -/// Runs multiple upper bound checks in parallel (one for each \p needle in needles). 
-/// An upper bound check returns the index of the first element in \p haystack that -/// causes \p compare_op(needle,element) to return true. If no item in \p haystack satisfies -/// this criteria, then \p haystack_size is returned. -/// Results are written by \p output. +/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes) +/// is written to `storage_size` and the function returns without performing the search operation. +/// * If used along with `rocprim::lower_bound`, the ith element of the given input must be located +/// in the semi-open interval `[lower_output[i], upper_output[i])` of `haystack`, in case of +/// being present at all. /// -/// \tparam Config - [optional] configuration information for the primitive. This can be -/// \p upper_bound_config or a custom class with the same members. -/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values). -/// \tparam NeedlesIterator - Iterator type for items we are performing upper bound checks -/// for (keys). -/// \tparam OutputIterator - Iterator type for the output indices. -/// \tparam CompareFunction [optional] A callable that can be used to compare two values. -/// defaults to rocprim::less. +/// \tparam Config - [optional] Configuration of the primitive. It has to be `upper_bound_config` or +/// a class derived from it. Default is `default_config`. +/// \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. 
Elements of +/// the type pointed by it must be comparable to elements of the type pointed by `HaystackIterator` +/// as either operand of `compare_op`. +/// \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet +/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type. +/// \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the +/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible +/// to bool. Default type is `::rocprim::less<>`. +/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage. +/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`. +/// \param [in] haystack - Iterator to the first element in the search range. Elements of this +/// range must be sorted. +/// \param [in] needles - Iterator to the first element in the range of values to search for on +/// `haystack`. +/// \param [out] output - Iterator to the first element in the output range. +/// \param [in] haystack_size - Number of elements in the search range `haystack`. +/// \param [in] needles_size - Number of elements in the input range `needles`. +/// \param [in] compare_op - Binary operation function object that is used to compare values. The +/// signature of the function should be equivalent to the following: +/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the +/// function object must not modify the objects passed to it. Default is `CompareFunction()`. +/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream). +/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is +/// forced in order to check for errors.
+/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of +/// type `hipError_t`. /// -/// \param [in] temporary_storage - pointer to device-accessible temporary storage. When -/// a null pointer is passed, the required allocation size (in bytes) is written to -/// \p storage_size and the function returns without performing the search operation. -/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage. -/// \param haystack [in] - iterator pointing to the beginning of the range to search through. -/// \param needles [in] - iterator pointing to the first of the elements to perform upper -/// bound checks on. -/// \param output [out] - Iterator pointing to the beginning of the range where the results -/// are to be stored. -/// \param haystack_size [in] - the total number of values to search through. -/// \param needles_size [in] - the total number of keys to perform upper bound checks for. -/// \param compare_op [in] - binary operation function that will be used for comparison. -/// The signature of the function should be equivalent to the following: -/// bool f(const T &a, const T &b);. The signature does not need to have -/// const &, but the function object must not modify the objects passed to it. -/// The default value is \p CompareFunction(). -/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream). -/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel -/// launch is forced in order to check for errors. Default value is \p false. +/// \par Example +/// \parblock +/// In this example a device-level upper bound computation on a haystack of double precision type +/// values is performed on an input array of integer values. +/// +/// \code{.cpp} +/// #include <rocprim/rocprim.hpp> +/// +/// // Prepare input and output (declare pointers, allocate device memory etc.). +/// size_t haystack_size; // e.g. 7 +/// double * haystack; // e.g.
{0, 1.5, 3, 4.5, 6, 7.5, 9} +/// size_t needles_size; // e.g. 5 +/// int * needles; // e.g. {1, 2, 3, 4, 5} +/// compare_op_type compare_op; // e.g. compare_op_type = rocprim::less<> +/// size_t * output; // empty array of needles_size elements +/// +/// // Get required size of the temporary storage. +/// void * temporary_storage = nullptr; +/// size_t temporary_storage_bytes; +/// rocprim::upper_bound(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // Allocate temporary storage. +/// hipMalloc(&temporary_storage, temporary_storage_bytes); +/// +/// // Perform binary search. +/// rocprim::upper_bound(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // output = {1, 2, 3, 3, 4} +/// \endcode +/// \endparblock template< class Config = default_config, class HaystackIterator, @@ -217,6 +351,8 @@ hipError_t upper_bound(void * temporary_storage, hipStream_t stream = 0, bool debug_synchronous = false) { + static_assert(detail::is_default_or_has_tag::value, + "Config must be a specialization of struct template upper_bound_config"); using value_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using config @@ -237,39 +373,97 @@ hipError_t upper_bound(void * temporary_storage, debug_synchronous); } -/// \brief Performs a device-level parallel binary search. +/// \brief Parallel primitive for performing a binary search (on a sorted range) of a given input. +/// +/// The `binary_search` function determines for each element of a given input if it's present +/// in a given ordered range `haystack`. 
It uses the search function `detail::binary_search_op` +/// which in turn uses a binary operator `compare_op` for comparing the input values with the +/// haystack ones. /// /// \par Overview -/// Runs multiple binary searches in parallel. The result is a sequence of bools, -/// where each bool indicates if the corresponding search succeeded (the key was found) -/// or not. Results are written by \p output. +/// * When a null pointer is passed as `temporary_storage`, the required allocation size (in bytes) +/// is written to `storage_size` and the function returns without performing the search operation. +/// +/// \tparam Config - [optional] Configuration of the primitive. It can be `binary_search_config` or +/// a class derived from it. Default is `default_config`. +/// \tparam HaystackIterator - [inferred] Random-access iterator type of the search range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam NeedlesIterator - [inferred] Random-access iterator type of the input range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. Elements of +/// the type pointed by it must be comparable to elements of the type pointed by `HaystackIterator` +/// as either operand of `compare_op`. +/// \tparam OutputIterator - [inferred] Random-access iterator type of the output range. Must meet +/// the requirements of a C++ OutputIterator concept. It can be a simple pointer type. +/// \tparam CompareFunction - [inferred] Type of binary function that accepts two arguments of the +/// types pointed by `HaystackIterator` and `NeedlesIterator`, and returns a value convertible to +/// bool. Default type is `::rocprim::less<>`. +/// \param [in] temporary_storage - Pointer to a device-accessible temporary storage. +/// \param [in,out] storage_size - Reference to the size (in bytes) of `temporary_storage`. +/// \param [in] haystack - Iterator to the first element in the search range. 
Elements of this +/// range must be sorted. +/// \param [in] needles - Iterator to the first element in the range of values to search for on +/// `haystack`. +/// \param [out] output - Iterator to the first element in the output range of boolean values. +/// \param [in] haystack_size - Number of elements in the search range `haystack`. +/// \param [in] needles_size - Number of elements in the input range `needles`. +/// \param [in] compare_op - Binary operation function object that is used to compare values. The +/// signature of the function should be equivalent to the following: +/// `bool f(const T &a, const U &b);`. It does not need to have `const &`, but the +/// function object must not modify the objects passed to it. Default is `CompareFunction()`. +/// \param [in] stream - [optional] HIP stream object. Default is `0` (default stream). +/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel launch is +/// forced in order to check for errors. +/// \return `hipSuccess` (`0`) after a successful search; otherwise a HIP runtime error of +/// type `hipError_t`. +/// +/// \par Example +/// \parblock +/// In this example a device-level binary search on a haystack of integer values is performed on an +/// input array of integer values too. /// -/// \tparam Config - [optional] configuration information for the primitive. This can be -/// \p binary_search_config or a custom class with the same members. -/// \tparam HaystackIterator - Iterator type for items we'll be searching through (values). -/// \tparam NeedlesIterator - Iterator type for item we are searching for (keys). -/// \tparam OutputIterator - Iterator type for the output bools. -/// \tparam CompareFunction [optional] A callable that can be used to compare two values. -/// defaults to rocprim::less. +/// \code{.cpp} +/// #include <rocprim/rocprim.hpp> /// -/// \param [in] temporary_storage - pointer to device-accessible temporary storage.
When -/// a null pointer is passed, the required allocation size (in bytes) is written to -/// \p storage_size and the function returns without performing the search operation. -/// \param [in,out] storage_size - reference to the size (in bytes) of \p temporary_storage. -/// \param haystack [in] - iterator pointing to the beginning of the range to search through. -/// \param needles [in] - iterator pointing to the first of the elements to find. -/// \param output [out] - Iterator pointing to the beginning of the range where the results -/// are to be stored. -/// \param haystack_size [in] - the total number of values to search through. -/// \param needles_size [in] - the total number of keys to search for. -/// \param compare_op [in] - binary operation function that will be used for comparison. -/// The signature of the function should be equivalent to the following: -/// bool f(const T &a, const T &b);. The signature does not need to have -/// const &, but the function object must not modify the objects passed to it. -/// The default value is \p CompareFunction(). -/// \param [in] stream - [optional] HIP stream object. Default is \p 0 (default stream). -/// \param [in] debug_synchronous - [optional] If true, synchronization after every kernel -/// launch is forced in order to check for errors. Default value is \p false. +/// // Prepare input and output (declare pointers, allocate device memory etc.). +/// size_t haystack_size; // e.g. 10 +/// int * haystack; // e.g. {0, 1, 2, 3, 4, 5, 6, 7, 8, 9} +/// size_t needles_size; // e.g. 8 +/// int * needles; // e.g. {0, 2, 12, 4, 14, 6, 8, 10} +/// compare_op_type compare_op; // e.g. compare_op_type = rocprim::less +/// size_t * output; // empty array of needles_size elements +/// +/// // Get required size of the temporary storage. 
+/// void * temporary_storage = nullptr; +/// size_t temporary_storage_bytes; +/// rocprim::binary_search(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // Allocate temporary storage. +/// hipMalloc(&temporary_storage, temporary_storage_bytes); +/// +/// // Perform binary search. +/// rocprim::binary_search(temporary_storage, +/// temporary_storage_bytes, +/// haystack, +/// needles, +/// output, +/// haystack_size, +/// needles_size, +/// compare_op, +/// stream, +/// debug_synchronous); +/// +/// // output = {1, 1, 0, 1, 0, 1, 1, 0} +/// \endcode +/// \endparblock template< class Config = default_config, class HaystackIterator, @@ -289,6 +483,8 @@ hipError_t binary_search(void * temporary_storage, hipStream_t stream = 0, bool debug_synchronous = false) { + static_assert(detail::is_default_or_has_tag::value, + "Config must be a specialization of struct template binary_search_config"); using value_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using config @@ -309,9 +505,9 @@ hipError_t binary_search(void * temporary_storage, debug_synchronous); } +END_ROCPRIM_NAMESPACE + /// @} // end of group devicemodule -END_ROCPRIM_NAMESPACE - #endif // ROCPRIM_DEVICE_DEVICE_BINARY_SEARCH_HPP_ diff --git a/rocprim/include/rocprim/device/device_binary_search_config.hpp b/rocprim/include/rocprim/device/device_binary_search_config.hpp index bf8b2f75c..7b4a968a0 100644 --- a/rocprim/include/rocprim/device/device_binary_search_config.hpp +++ b/rocprim/include/rocprim/device/device_binary_search_config.hpp @@ -56,8 +56,8 @@ struct wrapped_transform_config, template struct architecture_config { - static constexpr transform_config_params params = wrap_transform_config< - default_binary_search_config(Arch), Value, Output>>(); + static constexpr transform_config_params params 
+ = default_binary_search_config(Arch), Value, Output>{}; }; }; @@ -67,8 +67,8 @@ struct wrapped_transform_config, U template struct architecture_config { - static constexpr transform_config_params params = wrap_transform_config< - default_upper_bound_config(Arch), Value, Output>>(); + static constexpr transform_config_params params + = default_upper_bound_config(Arch), Value, Output>{}; }; }; @@ -78,8 +78,8 @@ struct wrapped_transform_config, U template struct architecture_config { - static constexpr transform_config_params params = wrap_transform_config< - default_lower_bound_config(Arch), Value, Output>>(); + static constexpr transform_config_params params + = default_lower_bound_config(Arch), Value, Output>{}; }; }; diff --git a/rocprim/include/rocprim/device/device_histogram.hpp b/rocprim/include/rocprim/device/device_histogram.hpp index 47bd796c9..0e8ff3700 100644 --- a/rocprim/include/rocprim/device/device_histogram.hpp +++ b/rocprim/include/rocprim/device/device_histogram.hpp @@ -449,8 +449,7 @@ inline hipError_t histogram_range_impl(void* temporary_storage, /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -553,8 +552,7 @@ inline hipError_t histogram_even(void* temporary_storage, /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// -/// \tparam Config - [optional] configuration of the primitive. 
It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -665,8 +663,7 @@ inline hipError_t histogram_even(void* temporary_storage, /// /// \tparam Channels - number of channels interleaved in the input samples. /// \tparam ActiveChannels - number of channels being used for computing histograms. -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -778,8 +775,7 @@ inline hipError_t multi_histogram_even(void* temporary_storage, /// /// \tparam Channels - number of channels interleaved in the input samples. /// \tparam ActiveChannels - number of channels being used for computing histograms. -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. 
@@ -886,8 +882,7 @@ inline hipError_t multi_histogram_even(void* temporary_storage, /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -984,8 +979,7 @@ inline hipError_t histogram_range(void* temporary_storage, /// * Returns the required size of \p temporary_storage in \p storage_size /// if \p temporary_storage in a null pointer. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -1091,8 +1085,7 @@ inline hipError_t histogram_range(void* temporary_storage, /// /// \tparam Channels - number of channels interleaved in the input samples. /// \tparam ActiveChannels - number of channels being used for computing histograms. -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. 
/// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. @@ -1199,8 +1192,7 @@ inline hipError_t multi_histogram_range(void* temporary_storage, /// /// \tparam Channels - number of channels interleaved in the input samples. /// \tparam ActiveChannels - number of channels being used for computing histograms. -/// \tparam Config - [optional] configuration of the primitive. It can be \p histogram_config -/// (preferred) or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p histogram_config or a class derived from it. /// \tparam SampleIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam Counter - integer type for histogram bin counters. 
diff --git a/rocprim/include/rocprim/device/device_histogram_config.hpp b/rocprim/include/rocprim/device/device_histogram_config.hpp index 5d27c174b..b631e2a4b 100644 --- a/rocprim/include/rocprim/device/device_histogram_config.hpp +++ b/rocprim/include/rocprim/device/device_histogram_config.hpp @@ -32,26 +32,16 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -constexpr histogram_config_params wrap_histogram_config() -{ - return histogram_config_params{ - {HistogramConfig::histogram::block_size, - HistogramConfig::histogram::items_per_thread, - HistogramConfig::histogram::size_limit}, - HistogramConfig::max_grid_size, - HistogramConfig::shared_impl_max_bins, - HistogramConfig::shared_impl_histograms - }; -} - template struct wrapped_histogram_config { + static_assert(std::is_same::value, + "Config must be a specialization of struct template histogram_config"); + template struct architecture_config { - static constexpr histogram_config_params params = wrap_histogram_config(); + static constexpr histogram_config_params params = HistogramConfig{}; }; }; @@ -62,10 +52,10 @@ struct wrapped_histogram_config(Arch), - Sample, - Channels, - ActiveChannels>>(); + = default_histogram_config(Arch), + Sample, + Channels, + ActiveChannels>{}; }; }; diff --git a/rocprim/include/rocprim/device/device_merge.hpp b/rocprim/include/rocprim/device/device_merge.hpp index 9a6fc3d12..30ef30e31 100644 --- a/rocprim/include/rocprim/device/device_merge.hpp +++ b/rocprim/include/rocprim/device/device_merge.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -216,8 +216,7 @@ hipError_t merge_impl(void * temporary_storage, /// if \p temporary_storage in a null pointer. 
/// * Accepts custom compare_functions for merging across the device. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p merge_config or a class derived from it. /// \tparam InputIterator1 - random-access iterator type of the first input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam InputIterator2 - random-access iterator type of the second input range. Must meet the @@ -321,8 +320,7 @@ hipError_t merge(void * temporary_storage, /// if \p temporary_storage in a null pointer. /// * Accepts custom compare_functions for merging across the device. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p merge_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p merge_config or a class derived from it. /// \tparam KeysInputIterator1 - random-access iterator type of the first keys input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysInputIterator2 - random-access iterator type of the second keys input range. 
Must meet the diff --git a/rocprim/include/rocprim/device/device_merge_sort.hpp b/rocprim/include/rocprim/device/device_merge_sort.hpp index f4d8365fc..6eb5b1808 100644 --- a/rocprim/include/rocprim/device/device_merge_sort.hpp +++ b/rocprim/include/rocprim/device/device_merge_sort.hpp @@ -61,13 +61,12 @@ ROCPRIM_KERNEL { static constexpr merge_sort_block_sort_config_params params = device_params(); block_sort_kernel_impl(keys_input, - keys_output, - values_input, - values_output, - sorted_block_size, - compare_function); + params.block_sort_config.items_per_thread>(keys_input, + keys_output, + values_input, + values_output, + sorted_block_size, + compare_function); } template::value; // if config is not custom, provide default value for merge sort limit - constexpr size_t merge_sort_limit = std:: - conditional, Config>::type::merge_sort_limit; + constexpr size_t merge_sort_limit + = std::conditional, Config>::type::merge_sort_limit; // Instantiate single sort config to find the threshold that determines which algorithm is used. @@ -732,8 +732,7 @@ inline hipError_t /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. 
Must meet the @@ -839,8 +838,7 @@ hipError_t radix_sort_keys(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -947,8 +945,7 @@ hipError_t radix_sort_keys_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -1073,8 +1070,7 @@ hipError_t radix_sort_pairs(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. 
+/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -1199,8 +1195,7 @@ hipError_t radix_sort_pairs_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Size - integral type that represents the problem size. /// @@ -1312,8 +1307,7 @@ hipError_t radix_sort_keys(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Size - integral type that represents the problem size. 
/// @@ -1425,8 +1419,7 @@ hipError_t radix_sort_keys_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Value - value type. /// \tparam Size - integral type that represents the problem size. @@ -1553,8 +1546,7 @@ hipError_t radix_sort_pairs(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p radix_sort_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Value - value type. /// \tparam Size - integral type that represents the problem size. diff --git a/rocprim/include/rocprim/device/device_radix_sort_config.hpp b/rocprim/include/rocprim/device/device_radix_sort_config.hpp index 1aca826e2..7bbdba6b1 100644 --- a/rocprim/include/rocprim/device/device_radix_sort_config.hpp +++ b/rocprim/include/rocprim/device/device_radix_sort_config.hpp @@ -48,7 +48,7 @@ template -struct radix_sort_config_v2 +struct radix_sort_config { #ifndef DOXYGEN_SHOULD_SKIP_THIS /// \brief Configuration of radix sort single kernel. 
@@ -62,52 +62,6 @@ struct radix_sort_config_v2 #endif }; -/// \brief Legacy configuration of device-level radix sort operation. -/// -/// \deprecated Due to a new implementation the configuration options no longer match the algorithm -/// parameters. Use `radix_sort_config_v2` for the new parameters of the algorithm. Only a best -/// effort mapping is provided for these options, parameters not applicable to the new algorithm -/// are ignored. -/// -/// Radix sort is executed in a single tile (at size < BlocksPerItem) or few iterations (passes) -/// depending on total number of bits to be sorted (\p begin_bit and \p end_bit), each iteration -/// sorts either \p LongRadixBits or \p ShortRadixBits bits, chosen to cover whole bit range in -/// optimal way. -/// -/// For example, if \p LongRadixBits is 7, \p ShortRadixBits is 6, \p begin_bit is 0 and \p end_bit -/// is 32 there will be 5 iterations: 7 + 7 + 6 + 6 + 6 = 32 bits. -/// -/// \tparam LongRadixBits - number of bits in long iterations. -/// \tparam ShortRadixBits - number of bits in short iterations, must be equal to or less than \p LongRadixBits. -/// \tparam ScanConfig - configuration of digits scan kernel. Must be \p kernel_config. -/// \tparam SortConfig - configuration of radix sort kernel. Must be \p kernel_config. -template, - class SortMergeConfig = kernel_config<1024, 1>, - unsigned int MergeSizeLimitBlocks = 1024U, - bool ForceSingleKernelConfig = false, - class OnesweepHistogramConfig = kernel_config<256, 8>, - class OnesweepSortConfig = kernel_config<256, 15>, - unsigned int OnesweepRadixBits = 4> -struct [[deprecated("use radix_sort_config_v2")]] radix_sort_config -{ -#ifndef DOXYGEN_SHOULD_SKIP_THIS - /// \brief Configuration of radix sort single kernel. - using single_sort_config = SortSingleConfig; - /// \brief Configuration of merge sort algorithm. - using merge_sort_config = default_config; - /// \brief Configuration of radix sort onesweep. 
- using onesweep_config = radix_sort_onesweep_config; - /// \brief Maximum number of items to use merge sort algorithm. - static constexpr size_t merge_sort_limit = 1024 * MergeSizeLimitBlocks; -#endif -}; - namespace detail { diff --git a/rocprim/include/rocprim/device/device_reduce.hpp b/rocprim/include/rocprim/device/device_reduce.hpp index 3db6b0661..649e1c583 100644 --- a/rocprim/include/rocprim/device/device_reduce.hpp +++ b/rocprim/include/rocprim/device/device_reduce.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -256,8 +256,7 @@ hipError_t reduce_impl(void * temporary_storage, /// * By default, the input type is used for accumulation. A custom type /// can be specified using rocprim::transform_iterator, see the example below. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p reduce_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -404,8 +403,7 @@ hipError_t reduce(void * temporary_storage, /// * By default, the input type is used for accumulation. A custom type /// can be specified using rocprim::transform_iterator, see the example below. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. 
It has to be \p reduce_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the diff --git a/rocprim/include/rocprim/device/device_reduce_by_key.hpp b/rocprim/include/rocprim/device/device_reduce_by_key.hpp index 051b63625..315f75668 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key.hpp @@ -215,14 +215,20 @@ hipError_t reduce_by_key_impl(void* temporary_storage, // The running accumulation across the launch boundary. accumulator_type* d_previous_accumulated = nullptr; + detail::temp_storage::layout layout{}; + const hipError_t layout_result + = scan_state_type::get_temp_storage_layout(number_of_tiles, stream, layout); + if(layout_result != hipSuccess) + { + return layout_result; + } + const hipError_t partition_result = detail::temp_storage::partition( temporary_storage, storage_size, detail::temp_storage::make_linear_partition( // This is valid even with scan_state_with_sleep_type - detail::temp_storage::make_partition( - &scan_state_storage, - scan_state_type::get_temp_storage_layout(number_of_tiles)), + detail::temp_storage::make_partition(&scan_state_storage, layout), detail::temp_storage::make_partition(&ordered_bid_storage, ordered_tile_id_type::get_temp_storage_layout()), detail::temp_storage::ptr_aligned_array(&d_global_head_count, use_limited_size ? 
1 : 0), @@ -239,12 +245,23 @@ hipError_t reduce_by_key_impl(void* temporary_storage, { return result; } + + scan_state_type scan_state{}; + hipError_t scan_state_result + = scan_state_type::create(scan_state, scan_state_storage, number_of_tiles, stream); + scan_state_with_sleep_type scan_state_with_sleep{}; + scan_state_result = scan_state_with_sleep_type::create(scan_state_with_sleep, + scan_state_storage, + number_of_tiles, + stream); + + if(scan_state_result != hipSuccess) + { + return scan_state_result; + } + auto with_scan_state - = [use_sleep, - scan_state = scan_state_type::create(scan_state_storage, number_of_tiles), - scan_state_with_sleep - = scan_state_with_sleep_type::create(scan_state_storage, number_of_tiles)]( - auto&& func) mutable -> decltype(auto) + = [use_sleep, scan_state, scan_state_with_sleep](auto&& func) mutable -> decltype(auto) { if(use_sleep) { @@ -380,8 +397,7 @@ hipError_t reduce_by_key_impl(void* temporary_storage, /// * Ranges specified by \p unique_output and \p aggregates_output must have at least /// *unique_count_output (i.e. the number of unique keys) elements. /// -/// \tparam Config - [optional] configuration of the primitive. It can be `reduce_by_key_config_v2` -/// or `default_config` +/// \tparam Config - [optional] configuration of the primitive. It has to be `reduce_by_key_config` or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam ValuesInputIterator - random-access iterator type of the input range. 
Must meet the diff --git a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp index 6bcf9dca3..e38426467 100644 --- a/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp +++ b/rocprim/include/rocprim/device/device_reduce_by_key_config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -54,7 +54,7 @@ template -struct reduce_by_key_config_v2 +struct reduce_by_key_config { /// Number of threads in a block. static constexpr unsigned int block_size = BlockSize; @@ -81,25 +81,6 @@ struct reduce_by_key_config_v2 static constexpr unsigned int size_limit = SizeLimit; }; -/// \brief Legacy configuration of device-level reduce-by-key operation. -/// -/// \deprecated Due to a new implementation the configuration options no longer match the algorithm -/// parameters. Use `reduce_by_key_config_v2` for the new parameters of the algorithm. Only a best -/// effort mapping is provided for these options, parameters not applicable to the new algorithm -/// are ignored. -/// -/// \tparam ScanConfig - configuration of carry-outs scan kernel. Must be \p kernel_config. -/// \tparam ReduceConfig - configuration of the main reduce-by-key kernel. Must be \p kernel_config. -template -struct [[deprecated("use reduce_by_key_config_v2")]] reduce_by_key_config - : reduce_by_key_config_v2 -{ - /// \brief Configuration of carry-outs scan kernel. - using scan = ScanConfig; - /// \brief Configuration of the main reduce-by-key kernel. 
- using reduce = ReduceConfig; -}; - namespace detail { @@ -117,25 +98,25 @@ struct fallback_config static constexpr unsigned int items_per_thread = std::max(1u, 15u / item_scale); using type - = reduce_by_key_config_v2::value, - items_per_thread, - block_load_method::block_load_transpose, - block_load_method::block_load_transpose, - block_scan_algorithm::using_warp_scan, - 2>; + = reduce_by_key_config::value, + items_per_thread, + block_load_method::block_load_transpose, + block_load_method::block_load_transpose, + block_scan_algorithm::using_warp_scan, + 2>; }; template struct default_config : std::conditional_t, + rocprim::reduce_by_key_config<256, + 15, + block_load_method::block_load_transpose, + block_load_method::block_load_transpose, + block_scan_algorithm::using_warp_scan, + sizeof(Value) < 16 ? 1 : 2>, typename reduce_by_key::fallback_config::type> {}; diff --git a/rocprim/include/rocprim/device/device_reduce_config.hpp b/rocprim/include/rocprim/device/device_reduce_config.hpp index f432ddc51..45f046453 100644 --- a/rocprim/include/rocprim/device/device_reduce_config.hpp +++ b/rocprim/include/rocprim/device/device_reduce_config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -36,6 +36,9 @@ namespace detail template struct wrapped_reduce_config { + static_assert(std::is_same::value, + "Config must be a specialization of struct template reduce_config"); + template struct architecture_config { diff --git a/rocprim/include/rocprim/device/device_run_length_encode.hpp b/rocprim/include/rocprim/device/device_run_length_encode.hpp index d155edb5b..109561cb1 100644 --- a/rocprim/include/rocprim/device/device_run_length_encode.hpp +++ b/rocprim/include/rocprim/device/device_run_length_encode.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -76,8 +76,7 @@ namespace detail /// * Ranges specified by \p unique_output and \p counts_output must have at least /// *runs_count_output (i.e. the number of runs) elements. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p run_length_encode_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam UniqueOutputIterator - random-access iterator type of the output range. Must meet the @@ -192,8 +191,7 @@ hipError_t run_length_encode(void * temporary_storage, /// * Ranges specified by \p offsets_output and \p counts_output must have at least /// *runs_count_output (i.e. the number of non-trivial runs) elements. 
/// -/// \tparam Config - [optional] configuration of the primitive. It can be \p run_length_encode_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p run_length_encode_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OffsetsOutputIterator - random-access iterator type of the output range. Must meet the diff --git a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp index c2ace005e..870fd41fc 100644 --- a/rocprim/include/rocprim/device/device_run_length_encode_config.hpp +++ b/rocprim/include/rocprim/device/device_run_length_encode_config.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -36,7 +36,7 @@ BEGIN_ROCPRIM_NAMESPACE /// \brief Configuration of device-level run-length encoding operation. /// /// \tparam ReduceByKeyConfig - configuration of device-level reduce-by-key operation. -/// Must be \p reduce_by_key_config_v2 or \p default_config. +/// Must be \p reduce_by_key_config or \p default_config. /// \tparam SelectConfig - configuration of device-level select operation. /// Must be \p select_config or \p default_config. 
template< diff --git a/rocprim/include/rocprim/device/device_scan.hpp b/rocprim/include/rocprim/device/device_scan.hpp index 003ed4f95..2cb648dad 100644 --- a/rocprim/include/rocprim/device/device_scan.hpp +++ b/rocprim/include/rocprim/device/device_scan.hpp @@ -226,14 +226,20 @@ inline auto scan_impl(void* temporary_storage, real_init_value_type* previous_last_element; real_init_value_type* new_last_element; + detail::temp_storage::layout layout{}; + hipError_t layout_result + = scan_state_type::get_temp_storage_layout(number_of_blocks, stream, layout); + if(layout_result != hipSuccess) + { + return layout_result; + } + const hipError_t partition_result = detail::temp_storage::partition( temporary_storage, storage_size, detail::temp_storage::make_linear_partition( // This is valid even with offset_scan_state_with_sleep_type - detail::temp_storage::make_partition( - &scan_state_storage, - scan_state_type::get_temp_storage_layout(number_of_blocks)), + detail::temp_storage::make_partition(&scan_state_storage, layout), detail::temp_storage::ptr_aligned_array(&previous_last_element, use_limited_size ? 1 : 0), detail::temp_storage::ptr_aligned_array(&new_last_element, use_limited_size ? 
1 : 0))); @@ -251,9 +257,18 @@ inline auto scan_impl(void* temporary_storage, if(number_of_blocks > 1 || use_limited_size) { // Create and initialize lookback_scan_state obj - auto scan_state = scan_state_type::create(scan_state_storage, number_of_blocks); - auto scan_state_with_sleep - = scan_state_with_sleep_type::create(scan_state_storage, number_of_blocks); + scan_state_type scan_state{}; + hipError_t result + = scan_state_type::create(scan_state, scan_state_storage, number_of_blocks, stream); + scan_state_with_sleep_type scan_state_with_sleep{}; + result = scan_state_with_sleep_type::create(scan_state_with_sleep, + scan_state_storage, + number_of_blocks, + stream); + if(result != hipSuccess) + { + return result; + } hipDeviceProp_t prop; int deviceId; @@ -442,7 +457,7 @@ inline auto scan_impl(void* temporary_storage, /// * By default, the input type is used for accumulation. A custom type /// can be specified using rocprim::transform_iterator, see the example below. /// -/// \tparam Config - [optional] configuration of the primitive, should be \p scan_config_v2. +/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -571,7 +586,7 @@ inline hipError_t inclusive_scan(void* temporary_storage, /// if \p temporary_storage in a null pointer. /// * Ranges specified by \p input and \p output must have at least \p size elements. /// -/// \tparam Config - [optional] configuration of the primitive, should be \p scan_config_v2. +/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. 
Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the diff --git a/rocprim/include/rocprim/device/device_scan_by_key.hpp b/rocprim/include/rocprim/device/device_scan_by_key.hpp index 7b59a0cac..8d8c4fecb 100644 --- a/rocprim/include/rocprim/device/device_scan_by_key.hpp +++ b/rocprim/include/rocprim/device/device_scan_by_key.hpp @@ -155,14 +155,20 @@ inline hipError_t scan_by_key_impl(void* const temporary_storage, void* scan_state_storage; wrapped_type* previous_last_value; + detail::temp_storage::layout layout{}; + const hipError_t layout_result + = scan_state_type::get_temp_storage_layout(number_of_blocks, stream, layout); + if(layout_result != hipSuccess) + { + return layout_result; + } + const hipError_t partition_result = detail::temp_storage::partition( temporary_storage, storage_size, detail::temp_storage::make_linear_partition( // This is valid even with offset_scan_state_with_sleep_type - detail::temp_storage::make_partition( - &scan_state_storage, - scan_state_type::get_temp_storage_layout(number_of_blocks)), + detail::temp_storage::make_partition(&scan_state_storage, layout), detail::temp_storage::ptr_aligned_array(&previous_last_value, use_limited_size ? 
1 : 0))); if(partition_result != hipSuccess || temporary_storage == nullptr) @@ -181,14 +187,23 @@ inline hipError_t scan_by_key_impl(void* const temporary_storage, return error; } + scan_state_type scan_state{}; + hipError_t scan_state_result + = scan_state_type::create(scan_state, scan_state_storage, number_of_blocks, stream); + scan_state_with_sleep_type scan_state_with_sleep{}; + scan_state_result = scan_state_with_sleep_type::create(scan_state_with_sleep, + scan_state_storage, + number_of_blocks, + stream); + if(scan_state_result != hipSuccess) + { + return scan_state_result; + } + // Call the provided function with either scan_state or scan_state_with_sleep based on // the value of use_sleep_scan_state auto with_scan_state - = [use_sleep, - scan_state = scan_state_type::create(scan_state_storage, number_of_blocks), - scan_state_with_sleep - = scan_state_with_sleep_type::create(scan_state_storage, number_of_blocks)]( - auto&& func) mutable -> decltype(auto) + = [use_sleep, scan_state, scan_state_with_sleep](auto&& func) mutable -> decltype(auto) { if(use_sleep) { @@ -305,7 +320,7 @@ inline hipError_t scan_by_key_impl(void* const temporary_storage, /// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have /// at least \p size elements. /// -/// \tparam Config - [optional] configuration of the primitive, should be \p scan_by_key_config_v2. +/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_by_key_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. It can be /// a simple pointer type. /// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be @@ -428,7 +443,7 @@ inline hipError_t inclusive_scan_by_key(void* const temporary_sto /// * Ranges specified by \p keys_input, \p values_input, and \p values_output must have /// at least \p size elements. 
/// -/// \tparam Config - [optional] configuration of the primitive, should be \p scan_by_key_config_v2. +/// \tparam Config - [optional] configuration of the primitive, has to be \p scan_by_key_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. It can be /// a simple pointer type. /// \tparam ValuesInputIterator - random-access iterator type of the input range. It can be diff --git a/rocprim/include/rocprim/device/device_scan_by_key_config.hpp b/rocprim/include/rocprim/device/device_scan_by_key_config.hpp index e18ab12fb..7018b874d 100644 --- a/rocprim/include/rocprim/device/device_scan_by_key_config.hpp +++ b/rocprim/include/rocprim/device/device_scan_by_key_config.hpp @@ -32,27 +32,16 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -constexpr scan_by_key_config_params wrap_scan_by_key_config() -{ - return scan_by_key_config_params{ - {ScanByKeyConfig::block_size, - ScanByKeyConfig::items_per_thread, - ScanByKeyConfig::size_limit}, - ScanByKeyConfig::block_load_method, - ScanByKeyConfig::block_store_method, - ScanByKeyConfig::block_scan_method - }; -} - template struct wrapped_scan_by_key_config { + static_assert(std::is_same::value, + "Config must be a specialization of struct template scan_by_key_config"); + template struct architecture_config { - static constexpr scan_by_key_config_params params - = wrap_scan_by_key_config(); + static constexpr scan_by_key_config_params params = ScanByKeyConfig{}; }; }; @@ -62,8 +51,8 @@ struct wrapped_scan_by_key_config template struct architecture_config { - static constexpr scan_by_key_config_params params = wrap_scan_by_key_config< - default_scan_by_key_config(Arch), Key, Value>>(); + static constexpr scan_by_key_config_params params + = default_scan_by_key_config(Arch), Key, Value>{}; }; }; diff --git a/rocprim/include/rocprim/device/device_scan_config.hpp b/rocprim/include/rocprim/device/device_scan_config.hpp index 1ebf39636..f2a4254da 100644 --- 
a/rocprim/include/rocprim/device/device_scan_config.hpp +++ b/rocprim/include/rocprim/device/device_scan_config.hpp @@ -32,24 +32,15 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { -template -constexpr scan_config_params wrap_scan_config() -{ - return scan_config_params{ - {ScanConfig::block_size, ScanConfig::items_per_thread, ScanConfig::size_limit}, - ScanConfig::block_load_method, - ScanConfig::block_store_method, - ScanConfig::block_scan_method - }; -} - template struct wrapped_scan_config { + static_assert(std::is_same::value, + "Config must be a specialization of struct template scan_config"); template struct architecture_config { - static constexpr scan_config_params params = wrap_scan_config(); + static constexpr scan_config_params params = ScanConfig{}; }; }; @@ -60,7 +51,7 @@ struct wrapped_scan_config struct architecture_config { static constexpr scan_config_params params - = wrap_scan_config(Arch), Value>>(); + = default_scan_config(Arch), Value>{}; }; }; diff --git a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp index 18c710f04..576789f06 100644 --- a/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp +++ b/rocprim/include/rocprim/device/device_segmented_radix_sort.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -589,8 +589,8 @@ hipError_t segmented_radix_sort_impl(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. 
It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -711,8 +711,8 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -834,8 +834,8 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. 
Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -975,8 +975,8 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam KeysInputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam KeysOutputIterator - random-access iterator type of the output range. Must meet the @@ -1116,8 +1116,8 @@ hipError_t segmented_radix_sort_pairs_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the /// requirements of a C++ OutputIterator concept. It can be a simple pointer type. 
@@ -1244,8 +1244,8 @@ hipError_t segmented_radix_sort_keys(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the /// requirements of a C++ OutputIterator concept. It can be a simple pointer type. @@ -1372,8 +1372,8 @@ hipError_t segmented_radix_sort_keys_desc(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Value - value type. /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the @@ -1515,8 +1515,8 @@ hipError_t segmented_radix_sort_pairs(void * temporary_storage, /// can be improved by setting \p begin_bit and \p end_bit, for example if all keys are in range /// [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range. /// -/// \tparam Config - [optional] configuration of the primitive. It can be -/// \p segmented_radix_sort_config or a custom class with the same members. 
+/// \tparam Config - [optional] configuration of the primitive. It has to be +/// \p segmented_radix_sort_config or a class derived from it. /// \tparam Key - key type. Must be an integral type or a floating-point type. /// \tparam Value - value type. /// \tparam OffsetIterator - random-access iterator type of segment offsets. Must meet the diff --git a/rocprim/include/rocprim/device/device_segmented_reduce.hpp b/rocprim/include/rocprim/device/device_segmented_reduce.hpp index aeeb35238..424b291ec 100644 --- a/rocprim/include/rocprim/device/device_segmented_reduce.hpp +++ b/rocprim/include/rocprim/device/device_segmented_reduce.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -161,8 +161,7 @@ hipError_t segmented_reduce_impl(void * temporary_storage, /// segments + 1 elements: offsets for \p begin_offsets and /// offsets + 1 for \p end_offsets. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p reduce_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p reduce_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. 
Must meet the diff --git a/rocprim/include/rocprim/device/device_segmented_scan.hpp b/rocprim/include/rocprim/device/device_segmented_scan.hpp index af1a3e446..4cad6e0f4 100644 --- a/rocprim/include/rocprim/device/device_segmented_scan.hpp +++ b/rocprim/include/rocprim/device/device_segmented_scan.hpp @@ -166,8 +166,7 @@ hipError_t segmented_scan_impl(void * temporary_storage, /// segments + 1 elements: offsets for \p begin_offsets and /// offsets + 1 for \p end_offsets. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -281,8 +280,7 @@ hipError_t segmented_inclusive_scan(void * temporary_storage, /// segments + 1 elements: offsets for \p begin_offsets and /// offsets + 1 for \p end_offsets. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -399,8 +397,7 @@ hipError_t segmented_exclusive_scan(void * temporary_storage, /// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements. /// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type. 
/// -/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -512,8 +509,7 @@ hipError_t segmented_inclusive_scan(void * temporary_storage, /// * Ranges specified by \p input, \p output, and \p flags must have at least \p size elements. /// * \p value_type of \p HeadFlagIterator iterator should be convertible to \p bool type. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p scan_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p scan_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. Must meet the diff --git a/rocprim/include/rocprim/device/device_select.hpp b/rocprim/include/rocprim/device/device_select.hpp index b2ade6a92..68ce3bbb5 100644 --- a/rocprim/include/rocprim/device/device_select.hpp +++ b/rocprim/include/rocprim/device/device_select.hpp @@ -58,8 +58,7 @@ namespace detail /// * Range specified by \p selected_count_output must have at least 1 element. /// * Values of \p flag range should be implicitly convertible to `bool` type. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. 
It has to be \p select_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. It can be /// a simple pointer type. /// \tparam FlagIterator - random-access iterator type of the flag range. It can be @@ -182,8 +181,7 @@ hipError_t select(void * temporary_storage, /// values can be copied into it. /// * Range specified by \p selected_count_output must have at least 1 element. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p select_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p select_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. It can be /// a simple pointer type. /// \tparam OutputIterator - random-access iterator type of the output range. It can be diff --git a/rocprim/include/rocprim/device/device_transform.hpp b/rocprim/include/rocprim/device/device_transform.hpp index e7fb6cea5..a9de7d827 100644 --- a/rocprim/include/rocprim/device/device_transform.hpp +++ b/rocprim/include/rocprim/device/device_transform.hpp @@ -35,11 +35,11 @@ #include "device_transform_config.hpp" #include "detail/device_transform.hpp" -BEGIN_ROCPRIM_NAMESPACE - /// \addtogroup devicemodule /// @{ +BEGIN_ROCPRIM_NAMESPACE + namespace detail { @@ -82,8 +82,7 @@ ROCPRIM_KERNEL /// \par Overview /// * Ranges specified by \p input and \p output must have at least \p size elements. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p transform_config or a class derived from it. /// \tparam InputIterator - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. 
/// \tparam OutputIterator - random-access iterator type of the output range. Must meet the @@ -208,8 +207,7 @@ inline hipError_t transform(InputIterator input, /// \par Overview /// * Ranges specified by \p input1, \p input2, and \p output must have at least \p size elements. /// -/// \tparam Config - [optional] configuration of the primitive. It can be \p transform_config or -/// a custom class with the same members. +/// \tparam Config - [optional] configuration of the primitive. It has to be \p transform_config or a class derived from it. /// \tparam InputIterator1 - random-access iterator type of the input range. Must meet the /// requirements of a C++ InputIterator concept. It can be a simple pointer type. /// \tparam InputIterator2 - random-access iterator type of the input range. Must meet the @@ -285,9 +283,9 @@ hipError_t transform(InputIterator1 input1, #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR +END_ROCPRIM_NAMESPACE + /// @} // end of group devicemodule -END_ROCPRIM_NAMESPACE - #endif // ROCPRIM_DEVICE_DEVICE_TRANSFORM_HPP_ diff --git a/rocprim/include/rocprim/device/device_transform_config.hpp b/rocprim/include/rocprim/device/device_transform_config.hpp index 3b4ed21cf..bf6c6c1a2 100644 --- a/rocprim/include/rocprim/device/device_transform_config.hpp +++ b/rocprim/include/rocprim/device/device_transform_config.hpp @@ -42,25 +42,16 @@ template struct default_transform_config : default_transform_config_base {}; -template -constexpr transform_config_params wrap_transform_config() -{ - return transform_config_params{ - { - TransformConfig::block_size, - TransformConfig::items_per_thread, - TransformConfig::size_limit, - } - }; -} - template struct wrapped_transform_config { + static_assert(std::is_base_of::value, + "Config must be a specialization of struct template transform_config"); + template struct architecture_config { - static constexpr transform_config_params params = wrap_transform_config(); + static constexpr transform_config_params 
params = TransformConfig{}; }; }; @@ -70,8 +61,8 @@ struct wrapped_transform_config template struct architecture_config { - static constexpr transform_config_params params = wrap_transform_config< - default_transform_config(Arch), Value>>(); + static constexpr transform_config_params params + = default_transform_config(Arch), Value>{}; }; }; diff --git a/rocprim/include/rocprim/intrinsics/thread.hpp b/rocprim/include/rocprim/intrinsics/thread.hpp index 3a3a664c1..d5949b601 100644 --- a/rocprim/include/rocprim/intrinsics/thread.hpp +++ b/rocprim/include/rocprim/intrinsics/thread.hpp @@ -44,24 +44,6 @@ constexpr unsigned int warp_size() return warpSize; } -/// \brief Returns a number of threads in a hardware warp for the actual device. -/// At host side this constant is available at runtime time only. -/// -/// It is constant for a device. -ROCPRIM_HOST inline -unsigned int host_warp_size() -{ - int default_hip_device; - hipError_t success = hipGetDevice(&default_hip_device); - hipDeviceProp_t device_prop; - success = hipGetDeviceProperties(&device_prop,default_hip_device); - - if(success != hipSuccess) - return -1; - else - return device_prop.warpSize; -}; - /// \brief Returns a number of threads in a hardware warp for the actual target. /// At device side this constant is available at compile time. /// diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 1e09fd74d..7a25d3cc9 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -117,18 +117,26 @@ int warp_all(int predicate) /// @} // end of group intrinsicsmodule -/** - * This function computes a lane mask of active lanes in the warp which which have - * the same value for label as the lane which calls the function. The bit at - * index \p i in the lane mask is set if the thread of lane \p i calls this function - * with the same value label. 
Only the least-significant \p LabelBits bits - * are taken into account when labels are considered to be equal. - */ +/// \brief Group active lanes having the same bits of \p label +/// +/// Threads that have the same least significant \p LabelBits bits are grouped into the same group. +/// Every lane in the warp receives a mask of all active lanes participating in its group. +/// +/// \tparam LabelBits number of bits to compare between labels +/// +/// \param [in] label the label for the calling lane +/// \param [in] valid lanes passing false will be ignored for comparisons, +/// such lanes will not be part of any group, and will always return an empty mask (0) +/// +/// \return A bit mask of lanes sharing the same bits for \p label. The bit at index +/// lane i's result includes bit j in the lane mask if lane j is part +/// of the same group as lane i, i.e. lane i and j called with the +/// same value for label. template -ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label) +ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label, bool valid = true) { // Obtain a mask with the threads which are currently active. - lane_mask_type peer_mask = ballot(1); + lane_mask_type peer_mask = ballot(valid); // Compute the final value iteratively by testing each bit separately. ROCPRIM_UNROLL @@ -141,21 +149,24 @@ ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type match_any(unsigned int label) peer_mask &= (bit_set ? same_mask : ~same_mask); } - return peer_mask; + return -lane_mask_type{valid} & peer_mask; } -/** - * This function computes a lane mask of active lanes in the warp which which have - * the same value for label as the lane which calls the function. The bit at - * index \p i in the lane mask is set if the thread of lane \p i calls this function - * with the same value label. Only the least-significant \p LabelBits bits - * are taken into account when labels are considered to be equal. 
- */ -template -[[deprecated("use rocprim::match_any instead")]] ROCPRIM_DEVICE ROCPRIM_INLINE lane_mask_type - MatchAny(unsigned int label) +/// \brief Elect a single lane for each group in \p mask +/// +/// \param [in] mask bit mask of the lanes in the same group as the calling lane. +/// The i-th bit should be set if lane i is in the same group +/// as the calling lane. +/// +/// \returns true for one unspecified lane in the mask, false for everyone else. +/// Returns false for all lanes not in any group, that is lanes passing 0 as \p mask. +/// +/// \pre The relation specified by \p mask must be symmetric and transitive, in other words: the groups +/// should be consistent between threads. +ROCPRIM_DEVICE ROCPRIM_INLINE bool group_elect(lane_mask_type mask) { - return match_any(label); + const unsigned int prev_same_count = ::rocprim::masked_bit_count(mask); + return prev_same_count == 0 && mask != 0; } END_ROCPRIM_NAMESPACE diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp index c2b587f5b..6a2ecabf8 100644 --- a/rocprim/include/rocprim/rocprim.hpp +++ b/rocprim/include/rocprim/rocprim.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -45,6 +45,7 @@ #include "block/block_histogram.hpp" #include "block/block_load.hpp" #include "block/block_radix_sort.hpp" +#include "block/block_run_length_decode.hpp" #include "block/block_scan.hpp" #include "block/block_sort.hpp" #include "block/block_store.hpp" diff --git a/rocprim/include/rocprim/thread/thread_search.hpp b/rocprim/include/rocprim/thread/thread_search.hpp index 54fe38d25..c6b6e7b03 100644 --- a/rocprim/include/rocprim/thread/thread_search.hpp +++ b/rocprim/include/rocprim/thread/thread_search.hpp @@ -75,33 +75,24 @@ ROCPRIM_HOST_DEVICE inline void merge_path_search( path_coordinate.y = diagonal - split_min; } - - - /// \brief Returns the offset of the first value within \p input which does not compare less than \p val /// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched /// \tparam OffsetT - [inferred] The data type of num_items /// \tparam T - [inferred] The data type of the input sequence elements /// \param input [in] - Input sequence -/// \param num_items [out] - Input sequence length +/// \param num_items [in] - Input sequence length /// \param val [in] - Search Key /// \return - Offset at which val was found -template < - typename InputIteratorT, - typename OffsetT, - typename T> -ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound( - InputIteratorT input, - OffsetT num_items, - T val) +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; - while (num_items > 0) + while(num_items > 0) { OffsetT half = num_items >> 1; - if (input[retval + half] < val) + if(input[retval + half] < val) { - retval = retval + (half + 1); + retval = retval + (half + 1); num_items = num_items - (half + 1); } else @@ -113,35 +104,28 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound( 
return retval; } - /// \brief Returns the offset of the first value within \p input which compares greater than \p val /// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched /// \tparam OffsetT - [inferred] The data type of num_items /// \tparam T - [inferred] The data type of the input sequence elements /// \param input [in] - Input sequence -/// \param num_items [out] - Input sequence length +/// \param num_items [in] - Input sequence length /// \param val [in] - Search Key /// \return - Offset at which val was found -template < - typename InputIteratorT, - typename OffsetT, - typename T> -ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; - while (num_items > 0) + while(num_items > 0) { OffsetT half = num_items >> 1; - if (val < input[retval + half]) + if(val < input[retval + half]) { num_items = half; } else { - retval = retval + (half + 1); + retval = retval + (half + 1); num_items = num_items - (half + 1); } } @@ -149,6 +133,42 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound( return retval; } +/// \brief Returns the offset of the first value within \p input which compares greater than \p val +/// computed as a statically unrolled loop +/// \tparam MaxNumItems - The maximum number of items. 
+/// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched +/// \tparam OffsetT - [inferred] The data type of num_items +/// \tparam T - [inferred] The data type of the input sequence elements +/// \param input [in] - Input sequence +/// \param num_items [in] - Input sequence length +/// \param val [in] - Search Key +/// \return - Offset at which val was found +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT static_upper_bound(InputIteratorT input, + OffsetT num_items, + T val) +{ + OffsetT lower_bound = 0; + OffsetT upper_bound = num_items; +#pragma unroll + for(int i = 0; i <= Log2::VALUE; i++) + { + OffsetT mid = lower_bound + (upper_bound - lower_bound) / 2; + mid = rocprim::min(mid, num_items - 1); + + if(val < input[mid]) + { + upper_bound = mid; + } + else + { + lower_bound = mid + 1; + } + } + + return lower_bound; +} + END_ROCPRIM_NAMESPACE #endif // ROCPRIM_THREAD_THREAD_SCAN_HPP_ diff --git a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp index 8d1bc20f2..df567ae33 100644 --- a/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_reduce_dpp.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -73,13 +73,7 @@ class warp_reduce_dpp // row_shr:8 output = reduce_op(warp_move_dpp(output), output); } -#if ROCPRIM_NAVI - if(WarpSize > 16) - { - // row_bcast:15 - output = reduce_op(warp_swizzle(output), output); - } -#else +#ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST if(WarpSize > 16) { // row_bcast:15 @@ -90,6 +84,14 @@ class warp_reduce_dpp // row_bcast:31 output = reduce_op(warp_move_dpp(output), output); } + static_assert(WarpSize <= 64, "WarpSize > 64 is not supported"); +#else + if(WarpSize > 16) + { + // row_bcast:15 + output = reduce_op(warp_swizzle(output), output); + } + static_assert(WarpSize <= 32, "WarpSize > 32 is not supported without DPP broadcasts"); #endif // Read the result from the last lane of the logical warp output = warp_shuffle(output, WarpSize - 1, WarpSize); diff --git a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp index cbe13674f..9ce2350ba 100644 --- a/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_scan_dpp.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -74,13 +74,7 @@ class warp_scan_dpp T t = scan_op(warp_move_dpp(output), output); // row_shr:8 if(row_lane_id >= 8) output = t; } -#if ROCPRIM_NAVI - if(WarpSize > 16) - { - T t = scan_op(warp_swizzle(output), output); // row_bcast:15 - if(lane_id % 32 >= 16) output = t; - } -#else +#ifdef ROCPRIM_DETAIL_HAS_DPP_BROADCAST if(WarpSize > 16) { T t = scan_op(warp_move_dpp(output), output); // row_bcast:15 @@ -91,6 +85,15 @@ class warp_scan_dpp T t = scan_op(warp_move_dpp(output), output); // row_bcast:31 if(lane_id >= 32) output = t; } + static_assert(WarpSize <= 64, "WarpSize > 64 is not supported"); +#else + if(WarpSize > 16) + { + T t = scan_op(warp_swizzle(output), output); // row_bcast:15 + if(lane_id % 32 >= 16) + output = t; + } + static_assert(WarpSize <= 32, "WarpSize > 32 is not supported without DPP broadcasts"); #endif } diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py index d8843fa7c..677e02f3e 100755 --- a/scripts/autotune/create_optimization.py +++ b/scripts/autotune/create_optimization.py @@ -459,6 +459,22 @@ class AlgorithmDeviceLowerBound(Algorithm): def __init__(self, fallback_entries): Algorithm.__init__(self, fallback_entries) +class AlgorithmDeviceAdjacentDifference(Algorithm): + algorithm_name = 'device_adjacent_difference' + cpp_configuration_template_name = 'adjacent_difference_config_template' + config_selection_params = [ + SelectionType(name='value_type', is_optional=False)] + def __init__(self, fallback_entries): + Algorithm.__init__(self, fallback_entries) + +class AlgorithmDeviceAdjacentDifferenceInplace(Algorithm): + algorithm_name = 'device_adjacent_difference_inplace' + cpp_configuration_template_name = 'adjacent_difference_inplace_config_template' + config_selection_params = [ + SelectionType(name='value_type', 
is_optional=False)] + def __init__(self, fallback_entries): + Algorithm.__init__(self, fallback_entries) + def filt_algo_regex(e, algorithm_name): if 'algo_regex' in e: return re.match(e['algo_regex'], algorithm_name) is not None @@ -488,6 +504,10 @@ def create_algorithm(algorithm_name: str, fallback_entries): return AlgorithmDeviceUpperBound(fallback_entries) elif algorithm_name == 'device_lower_bound': return AlgorithmDeviceLowerBound(fallback_entries) + elif algorithm_name == 'device_adjacent_difference': + return AlgorithmDeviceAdjacentDifference(fallback_entries) + elif algorithm_name == 'device_adjacent_difference_inplace': + return AlgorithmDeviceAdjacentDifferenceInplace(fallback_entries) else: raise(NotSupportedError(f'Algorithm "{algorithm_name}" is not supported (yet)')) diff --git a/scripts/autotune/templates/adjacent_difference_config_template b/scripts/autotune/templates/adjacent_difference_config_template new file mode 100644 index 000000000..f40ad24cd --- /dev/null +++ b/scripts/autotune/templates/adjacent_difference_config_template @@ -0,0 +1,20 @@ +{% extends "config_template" %} + +{% macro get_header_guard() %} +ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_HPP_ +{%- endmacro %} + +{% macro kernel_configuration(measurement) -%} +adjacent_difference_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { }; +{%- endmacro %} + +{% macro general_case() -%} +template +struct default_adjacent_difference_config : default_adjacent_difference_config_base +{}; +{%- endmacro %} + +{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%} +// Based on {{ based_on_type }} +template struct default_adjacent_difference_config({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> : +{%- endmacro %} diff --git a/scripts/autotune/templates/adjacent_difference_inplace_config_template b/scripts/autotune/templates/adjacent_difference_inplace_config_template 
new file mode 100644 index 000000000..1031bf5e0 --- /dev/null +++ b/scripts/autotune/templates/adjacent_difference_inplace_config_template @@ -0,0 +1,20 @@ +{% extends "config_template" %} + +{% macro get_header_guard() %} +ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_ADJACENT_DIFFERENCE_INPLACE_HPP_ +{%- endmacro %} + +{% macro kernel_configuration(measurement) -%} +adjacent_difference_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { }; +{%- endmacro %} + +{% macro general_case() -%} +template +struct default_adjacent_difference_inplace_config : default_adjacent_difference_config_base +{}; +{%- endmacro %} + +{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%} +// Based on {{ based_on_type }} +template struct default_adjacent_difference_inplace_config({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> : +{%- endmacro %} diff --git a/scripts/autotune/templates/scan_config_template b/scripts/autotune/templates/scan_config_template index 2650752b4..02f4fafa5 100644 --- a/scripts/autotune/templates/scan_config_template +++ b/scripts/autotune/templates/scan_config_template @@ -5,7 +5,7 @@ ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_SCAN_HPP_ {%- endmacro %} {% macro kernel_configuration(measurement) -%} -scan_config_v2<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { }; +scan_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { }; {%- endmacro %} {% macro general_case() -%} diff --git a/scripts/autotune/templates/scanbykey_config_template b/scripts/autotune/templates/scanbykey_config_template index c59432354..e17a89de7 100644 --- 
a/scripts/autotune/templates/scanbykey_config_template +++ b/scripts/autotune/templates/scanbykey_config_template @@ -5,7 +5,7 @@ ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_SCAN_BY_KEY_HPP_ {%- endmacro %} {% macro kernel_configuration(measurement) -%} -scan_by_key_config_v2<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { }; +scan_by_key_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}, ::rocprim::block_load_method::block_load_transpose, ::rocprim::block_store_method::block_store_transpose, {{ measurement['cfg']['method'] }}> { }; {%- endmacro %} {% macro general_case() -%} diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 78e039a95..8db2d5c5b 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -234,6 +234,7 @@ add_rocprim_test("rocprim.block_sort_merge_stable" test_block_sort_merge_stable. add_rocprim_test_parallel("rocprim.block_radix_rank" test_block_radix_rank.cpp.in) add_rocprim_test("rocprim.block_radix_sort" test_block_radix_sort.cpp) add_rocprim_test("rocprim.block_reduce" test_block_reduce.cpp) +add_rocprim_test("rocprim.block_run_length_decode" test_block_run_length_decode.cpp) add_rocprim_test_parallel("rocprim.block_scan" test_block_scan.cpp.in) add_rocprim_test("rocprim.block_shuffle" test_block_shuffle.cpp) add_rocprim_test("rocprim.block_sort_bitonic" test_block_sort_bitonic.cpp) diff --git a/test/rocprim/test_block_exchange.kernels.hpp b/test/rocprim/test_block_exchange.kernels.hpp index f3783c9ff..5c56fff43 100644 --- a/test/rocprim/test_block_exchange.kernels.hpp +++ b/test/rocprim/test_block_exchange.kernels.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -172,15 +172,12 @@ void scatter_to_striped_kernel(Type* device_input, OutputType* device_output, un } // Test for exchange -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int /*device_id*/) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -256,15 +253,12 @@ auto test_block_exchange() HIP_CHECK(hipFree(device_output)); } -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int /*device_id*/) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -340,15 +334,12 @@ auto test_block_exchange() HIP_CHECK(hipFree(device_output)); } -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int device_id) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -367,8 +358,11 @@ auto test_block_exchange() std::vector expected(size); std::vector output(size, output_type(0)); - const size_t warp_size = std::min(block_size, size_t(::rocprim::host_warp_size())); - const size_t warps_no = (block_size + warp_size - 1) / warp_size; + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); + + const size_t warp_size = std::min(block_size, size_t(current_device_warp_size)); + const size_t warps_no = (block_size + warp_size - 1) / warp_size; const size_t items_per_warp = warp_size * items_per_thread; // 
Calculate input and expected results on host @@ -436,15 +430,12 @@ auto test_block_exchange() HIP_CHECK(hipFree(device_output)); } -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int device_id) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -463,8 +454,11 @@ auto test_block_exchange() std::vector expected(size); std::vector output(size, output_type(0)); - const size_t warp_size = std::min(block_size, size_t(::rocprim::host_warp_size())); - const size_t warps_no = (block_size + warp_size - 1) / warp_size; + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); + + const size_t warp_size = std::min(block_size, size_t(current_device_warp_size)); + const size_t warps_no = (block_size + warp_size - 1) / warp_size; const size_t items_per_warp = warp_size * items_per_thread; // Calculate input and expected results on host @@ -530,15 +524,12 @@ auto test_block_exchange() HIP_CHECK(hipFree(device_output)); } -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int /*device_id*/) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -632,15 +623,12 @@ auto test_block_exchange() HIP_CHECK(hipFree(device_ranks)); } -template< - class T, - class U, - int Method, - unsigned int BlockSize = 256U, - unsigned int ItemsPerThread = 1U -> -auto test_block_exchange() --> typename std::enable_if::type +template +auto test_block_exchange(int /*device_id*/) -> typename std::enable_if::type { using type = T; using output_type = U; @@ -753,7 +741,7 @@ struct static_for SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); 
HIP_CHECK(hipSetDevice(device_id)); - test_block_exchange(); + test_block_exchange(device_id); static_for::run(); } }; diff --git a/test/rocprim/test_block_radix_sort.cpp b/test/rocprim/test_block_radix_sort.cpp index f903cee50..bcc032b2b 100644 --- a/test/rocprim/test_block_radix_sort.cpp +++ b/test/rocprim/test_block_radix_sort.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -40,7 +40,7 @@ struct RocprimBlockRadixSort; struct Integral; #define suite_name RocprimBlockRadixSort -#define warp_params BlockParamsIntegral +#define warp_params BlockParamsIntegralExtended #define name_suffix Integral #include "test_block_radix_sort.hpp" diff --git a/test/rocprim/test_block_run_length_decode.cpp b/test/rocprim/test_block_run_length_decode.cpp new file mode 100644 index 000000000..c16853af7 --- /dev/null +++ b/test/rocprim/test_block_run_length_decode.cpp @@ -0,0 +1,290 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include +#include +#include +#include + +// required test headers +#include "rocprim/block/block_load_func.hpp" +#include "rocprim/block/block_store_func.hpp" +#include "rocprim/functional.hpp" +#include "test_utils_types.hpp" + +template +struct Params +{ + using item_type = ItemT; + using length_type = LengthT; + static constexpr unsigned block_size = BlockSize; + static constexpr unsigned runs_per_thread = RunsPerThread; + static constexpr unsigned decoded_items_per_thread = DecodedItemsPerThread; +}; + +template +class HipcubBlockRunLengthDecodeTest : public ::testing::Test +{ +public: + using params = Params; +}; + +using HipcubBlockRunLengthDecodeTestParams + = ::testing::Types, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params>; + +TYPED_TEST_SUITE(HipcubBlockRunLengthDecodeTest, HipcubBlockRunLengthDecodeTestParams); + +template +__global__ + __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, + const LengthT* d_run_lengths, + ItemT* d_decoded_items, + LengthT* d_decoded_offsets) +{ + using BlockRunLengthDecodeT + = rocprim::block_run_length_decode; + + static constexpr unsigned int decoded_items_per_block = BlockSize * DecodedItemsPerThread; + + ROCPRIM_SHARED_MEMORY typename 
BlockRunLengthDecodeT::storage_type temp_storage; + + ItemT run_items[RunsPerThread]; + LengthT run_lengths[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); + rocprim::block_load_direct_blocked(global_thread_idx, d_run_lengths, run_lengths); + + unsigned total_decoded_size{}; + BlockRunLengthDecodeT block_run_length_decode(temp_storage, + run_items, + run_lengths, + total_decoded_size); + + unsigned decoded_window_offset = 0; + while(decoded_window_offset < total_decoded_size) + { + ItemT decoded_items[DecodedItemsPerThread]; + LengthT decoded_offsets[DecodedItemsPerThread]; + + block_run_length_decode.run_length_decode(decoded_items, + decoded_offsets, + decoded_window_offset); + + rocprim::block_store_direct_blocked( + global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items, + rocprim::minimum{}(total_decoded_size - decoded_window_offset, + decoded_items_per_block)); + + rocprim::block_store_direct_blocked( + global_thread_idx, + d_decoded_offsets + decoded_window_offset, + decoded_offsets, + rocprim::minimum{}(total_decoded_size - decoded_window_offset, + decoded_items_per_block)); + + decoded_window_offset += decoded_items_per_block; + } +} + +TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) +{ + const int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using ItemT = typename TestFixture::params::item_type; + using LengthT = typename TestFixture::params::length_type; + constexpr unsigned block_size = TestFixture::params::block_size; + constexpr unsigned runs_per_thread = TestFixture::params::runs_per_thread; + constexpr unsigned decoded_items_per_thread = TestFixture::params::decoded_items_per_thread; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; 
seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + size_t num_runs = runs_per_thread * block_size; + constexpr LengthT max_run_length = static_cast( + std::min(1000ll, static_cast(std::numeric_limits::max()))); + + auto run_items = std::vector(num_runs); + run_items[0] = test_utils::get_random_value(test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + ++seed_value); + + size_t run_item_index = 1; + while(run_item_index < num_runs) + { + run_items[run_item_index] + = test_utils::get_random_value(test_utils::numeric_limits::min(), + test_utils::numeric_limits::max(), + ++seed_value); + if(test_utils::convert_to_native(run_items[run_item_index]) + != test_utils::convert_to_native(run_items[run_item_index - 1])) + { + ++run_item_index; + } + } + + auto run_lengths = test_utils::get_random_data(num_runs, + static_cast(1), + max_run_length, + seed_value); + + std::default_random_engine prng(seed_value); + std::uniform_int_distribution num_empty_runs_dist(1, 4); + const size_t num_trailing_empty_runs = num_empty_runs_dist(prng); + num_runs += num_trailing_empty_runs; + + const auto empty_run_items + = test_utils::get_random_data(num_trailing_empty_runs, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); + run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end()); + run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast(0)); + + std::vector expected; + for(size_t i = 0; i < run_items.size(); ++i) + { + for(size_t j = 0; j < static_cast(run_lengths[i]); ++j) + { + expected.push_back(run_items[i]); + } + } + + ItemT* d_run_items{}; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + 
hipMemcpyHostToDevice)); + + LengthT* d_run_lengths{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths, + run_lengths.size() * sizeof(LengthT))); + HIP_CHECK(hipMemcpy(d_run_lengths, + run_lengths.data(), + run_lengths.size() * sizeof(LengthT), + hipMemcpyHostToDevice)); + + ItemT* d_decoded_runs{}; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT))); + + LengthT* d_decoded_offsets{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_decoded_offsets, + expected.size() * sizeof(LengthT))); + block_run_length_decode_kernel + <<>>(d_run_items, + d_run_lengths, + d_decoded_runs, + d_decoded_offsets); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + std::vector output(expected.size()); + HIP_CHECK(hipMemcpy(output.data(), + d_decoded_runs, + output.size() * sizeof(ItemT), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipGetLastError()) + + std::vector offsets(expected.size()); + HIP_CHECK(hipMemcpy(offsets.data(), + d_decoded_offsets, + offsets.size() * sizeof(LengthT), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_lengths)); + HIP_CHECK(hipFree(d_decoded_runs)); + HIP_CHECK(hipFree(d_decoded_offsets)); + + unsigned int expected_offset = -1; + ItemT previous_value = ItemT{}; + for(size_t i = 0; i < output.size(); ++i) + { + ASSERT_EQ(test_utils::convert_to_native(output[i]), + test_utils::convert_to_native(expected[i])); + if(test_utils::convert_to_native(output[i]) + != test_utils::convert_to_native(previous_value)) + { + previous_value = output[i]; + expected_offset = 0; + } + else + { + expected_offset = ++expected_offset; + } + + ASSERT_EQ(offsets[i], expected_offset); + } + } +} diff --git a/test/rocprim/test_device_adjacent_difference.cpp b/test/rocprim/test_device_adjacent_difference.cpp index 63d7f2269..784b7b950 100644 --- a/test/rocprim/test_device_adjacent_difference.cpp +++ b/test/rocprim/test_device_adjacent_difference.cpp @@ -1,6 
+1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -36,28 +36,6 @@ namespace { -template -struct size_limit_config -{ - static constexpr unsigned int item_scale - = ::rocprim::detail::ceiling_div(sizeof(T), sizeof(int)); - - using type - = rocprim::adjacent_difference_config<256, - ::rocprim::max(1u, 16u / item_scale), - rocprim::block_load_method::block_load_transpose, - rocprim::block_store_method::block_store_transpose, - SizeLimit>; -}; - -template -struct size_limit_config -{ - using type = rocprim::default_config; -}; - -template -using size_limit_config_t = typename size_limit_config::type; template & input, } // namespace // Params for tests -template +template + struct DeviceAdjacentDifferenceParams { using input_type = InputType; @@ -159,7 +138,7 @@ struct DeviceAdjacentDifferenceParams static constexpr bool left = Left; static constexpr bool in_place = InPlace; static constexpr bool use_identity_iterator = UseIdentityIterator; - static constexpr size_t size_limit = SizeLimit; + using config = Config; }; template @@ -172,25 +151,38 @@ class RocprimDeviceAdjacentDifferenceTests : public ::testing::Test static constexpr bool in_place = Params::in_place; static constexpr bool use_identity_iterator = Params::use_identity_iterator; static constexpr bool debug_synchronous = false; - static constexpr size_t size_limit = Params::size_limit; + using config = typename Params::config; }; using custom_double2 = test_utils::custom_test_type; using custom_int64_array = test_utils::custom_test_array_type; +using custom_config_0 = rocprim::adjacent_difference_config<128, 4>; + +template +using custom_size_limit_config + = rocprim::adjacent_difference_config<1024, + 2, + 
rocprim::block_load_method::block_load_transpose, + rocprim::block_store_method::block_store_transpose, + SizeLimit>; + using RocprimDeviceAdjacentDifferenceTestsParams = ::testing::Types< + // Tests with default configuration DeviceAdjacentDifferenceParams, DeviceAdjacentDifferenceParams, DeviceAdjacentDifferenceParams, DeviceAdjacentDifferenceParams, - DeviceAdjacentDifferenceParams, - DeviceAdjacentDifferenceParams, - DeviceAdjacentDifferenceParams>; + DeviceAdjacentDifferenceParams, + DeviceAdjacentDifferenceParams, + DeviceAdjacentDifferenceParams, + // Tests for supported config structs + DeviceAdjacentDifferenceParams, + DeviceAdjacentDifferenceParams, + // Tests for different size_limits + DeviceAdjacentDifferenceParams>, + DeviceAdjacentDifferenceParams>, + DeviceAdjacentDifferenceParams>>; TYPED_TEST_SUITE(RocprimDeviceAdjacentDifferenceTests, RocprimDeviceAdjacentDifferenceTestsParams); @@ -206,7 +198,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference) static constexpr bool in_place = TestFixture::in_place; static constexpr bool use_identity_iterator = TestFixture::use_identity_iterator; static constexpr bool debug_synchronous = TestFixture::debug_synchronous; - using Config = size_limit_config_t; + using Config = typename TestFixture::config; SCOPED_TRACE(testing::Message() << "left = " << left << ", in_place = " << in_place); diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp index b94138221..e7370ab61 100644 --- a/test/rocprim/test_device_binary_search.cpp +++ b/test/rocprim/test_device_binary_search.cpp @@ -52,37 +52,24 @@ class RocprimDeviceBinarySearch : public ::testing::Test { using custom_int2 = test_utils::custom_test_type; using custom_double2 = test_utils::custom_test_type; -using custom_config_0 = rocprim::transform_config<128, 4>; -using custom_config_1 = rocprim::binary_search_config<64, 2>; -struct custom_config_2 -{ - static constexpr unsigned int block_size = 256; - 
static constexpr unsigned int items_per_thread = 1; - static constexpr unsigned int size_limit = ROCPRIM_GRID_SIZE_LIMIT; -}; - -typedef ::testing::Types, - params, - custom_config_0>, - params>, - params, - params, - params, - params>, - params, - custom_config_1>, - params, - params, - custom_config_2>> +struct use_custom_config +{}; + +typedef ::testing::Types< + params, + params>, + params>, + params, + params, + params, + params>, + params, + use_custom_config>, + params, + params>> Params; TYPED_TEST_SUITE(RocprimDeviceBinarySearch, Params); @@ -97,7 +84,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) using needle_type = typename TestFixture::params::needle_type; using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; - using config = typename TestFixture::params::config; + using config = std::conditional_t< + std::is_same::value, + rocprim::lower_bound_config<64, 2>, + typename TestFixture::params::config>; hipStream_t stream = 0; @@ -218,7 +208,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) using needle_type = typename TestFixture::params::needle_type; using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; - using config = typename TestFixture::params::config; + using config = std::conditional_t< + std::is_same::value, + rocprim::upper_bound_config<64, 2>, + typename TestFixture::params::config>; hipStream_t stream = 0; @@ -338,7 +331,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) using needle_type = typename TestFixture::params::needle_type; using output_type = typename TestFixture::params::output_type; using compare_op_type = typename TestFixture::params::compare_op_type; - using config = typename TestFixture::params::config; + using config = std::conditional_t< + std::is_same::value, + rocprim::binary_search_config<64, 2>, + typename TestFixture::params::config>; 
hipStream_t stream = 0; diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index 03cf47762..20c9c92c1 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -100,16 +100,6 @@ struct transform_op } }; -// provides the same members as rocprim::histogram_config -struct user_config -{ - using histogram = ::rocprim::kernel_config<256, 1>; - - static constexpr unsigned int max_grid_size = 1024; - static constexpr unsigned int shared_impl_max_bins = 2048; - static constexpr unsigned int shared_impl_histograms = 3; -}; - template, params1, params1, - params1, + params1, params1, params1> Params1; diff --git a/test/rocprim/test_device_radix_sort.cpp.in b/test/rocprim/test_device_radix_sort.cpp.in index bd37a9fb5..b56e85889 100644 --- a/test/rocprim/test_device_radix_sort.cpp.in +++ b/test/rocprim/test_device_radix_sort.cpp.in @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -49,6 +49,10 @@ #endif #if ROCPRIM_TEST_TYPE_SLICE == 0 +#if defined(__GNUC__) || defined(__clang__) + INSTANTIATE(params<__int128_t, __int128_t>) + INSTANTIATE(params<__uint128_t, __uint128_t>) +#endif INSTANTIATE(params) INSTANTIATE(params) INSTANTIATE(params) diff --git a/test/rocprim/test_device_radix_sort.hpp b/test/rocprim/test_device_radix_sort.hpp index 628d0f73b..0b7e81620 100644 --- a/test/rocprim/test_device_radix_sort.hpp +++ b/test/rocprim/test_device_radix_sort.hpp @@ -139,10 +139,10 @@ inline void sort_keys() test_utils::key_comparator()); // Use arbitrary custom config to increase test coverage without making more test cases - using config = rocprim::radix_sort_config_v2; + using config = rocprim::radix_sort_config; size_t temporary_storage_bytes; HIP_CHECK(rocprim::radix_sort_keys(nullptr, @@ -319,7 +319,7 @@ inline void sort_pairs() } // Use arbitrary custom config to increase test coverage without making more test cases - using config = rocprim::radix_sort_config_v2< + using config = rocprim::radix_sort_config< rocprim::kernel_config<256, 1>, rocprim::merge_sort_config<128, 64, 2, 128, 64, 2>, rocprim::radix_sort_onesweep_config, diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp index 38d728c89..c0a9d8285 100644 --- a/test/rocprim/test_device_reduce_by_key.cpp +++ b/test/rocprim/test_device_reduce_by_key.cpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -543,3 +543,95 @@ TEST(RocprimDeviceReduceByKey, LargeSegmentCountReduceByKeyLargeValueType) // large value type to test TilesPerBlock > 1 large_segment_count_reduce_by_key>(); } + +TEST(RocprimDeviceReduceByKey, ReduceByNonEqualKeys) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using key_type = size_t; + using value_type = unsigned int; + + const bool debug_synchronous = false; + + ::rocprim::plus reduce_op; + auto key_compare_op = [](const auto&, const auto&) { return false; }; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + for(size_t block_size_multiple : test_utils::get_block_size_multiples(seed_value, 256)) + { + const size_t size = block_size_multiple + 1; + + SCOPED_TRACE(testing::Message() << "with size = " << size); + + hipStream_t stream = 0; // default + + // Using segments of size 1. + auto d_keys_input = rocprim::make_counting_iterator(key_type(0)); + + // Setting all values to 1, so the reduction will contain the size of the input array. 
+ auto d_values_input = rocprim::constant_iterator(1); + + size_t unique_count_expected = size; + + // Discard all output + auto d_unique_output = rocprim::make_discard_iterator(); + auto d_aggregates_output = rocprim::make_discard_iterator(); + + size_t* d_unique_count_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_unique_count_output, sizeof(size_t))); + + size_t temporary_storage_bytes; + + HIP_CHECK(rocprim::reduce_by_key(nullptr, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream, + debug_synchronous)); + + ASSERT_GT(temporary_storage_bytes, 0); + + void* d_temporary_storage; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); + + HIP_CHECK(rocprim::reduce_by_key(d_temporary_storage, + temporary_storage_bytes, + d_keys_input, + d_values_input, + size, + d_unique_output, + d_aggregates_output, + d_unique_count_output, + reduce_op, + key_compare_op, + stream, + debug_synchronous)); + + HIP_CHECK(hipFree(d_temporary_storage)); + + size_t unique_count_output; + HIP_CHECK(hipMemcpy(&unique_count_output, + d_unique_count_output, + sizeof(unique_count_output), + hipMemcpyDeviceToHost)); + + HIP_CHECK(hipFree(d_unique_count_output)); + + ASSERT_EQ(unique_count_output, unique_count_expected); + } + } +} diff --git a/test/rocprim/test_device_scan.cpp b/test/rocprim/test_device_scan.cpp index 5f7aed218..4277b1cd5 100644 --- a/test/rocprim/test_device_scan.cpp +++ b/test/rocprim/test_device_scan.cpp @@ -43,43 +43,24 @@ struct default_config_helper using type = ::rocprim::default_config; }; -struct user_config_helper -{ - // provides the same members as rocprim::scan_config and rocprim::scan_by_key_config - template - struct type - { - static constexpr unsigned int block_size = 256; - static constexpr unsigned int items_per_thread = 4; - static constexpr bool use_lookback = false; - static 
constexpr ::rocprim::block_load_method block_load_method - = ::rocprim::block_load_method::default_method; - static constexpr ::rocprim::block_store_method block_store_method - = ::rocprim::block_store_method::default_method; - static constexpr ::rocprim::block_scan_algorithm block_scan_method - = ::rocprim::block_scan_algorithm::default_algorithm; - static constexpr unsigned int size_limit = ROCPRIM_GRID_SIZE_LIMIT; - }; -}; - template struct size_limit_config_helper { template using type = std::conditional_t< ByKey, - rocprim::scan_by_key_config_v2<256, - 16, - rocprim::block_load_method::block_load_transpose, - rocprim::block_store_method::block_store_transpose, - rocprim::block_scan_algorithm::using_warp_scan, - SizeLimit>, - rocprim::scan_config_v2<256, - 16, - rocprim::block_load_method::block_load_transpose, - rocprim::block_store_method::block_store_transpose, - rocprim::block_scan_algorithm::using_warp_scan, - SizeLimit>>; + rocprim::scan_by_key_config<256, + 16, + rocprim::block_load_method::block_load_transpose, + rocprim::block_store_method::block_store_transpose, + rocprim::block_scan_algorithm::using_warp_scan, + SizeLimit>, + rocprim::scan_config<256, + 16, + rocprim::block_load_method::block_load_transpose, + rocprim::block_store_method::block_store_transpose, + rocprim::block_scan_algorithm::using_warp_scan, + SizeLimit>>; }; // Params for tests @@ -127,14 +108,14 @@ typedef ::testing::Types< DeviceScanParams, false, size_limit_config_helper<524288>>, DeviceScanParams, false, size_limit_config_helper<1048576>>, DeviceScanParams>, - DeviceScanParams, false, user_config_helper>, + DeviceScanParams, false>, DeviceScanParams>, DeviceScanParams>, DeviceScanParams>, DeviceScanParams>, // Large DeviceScanParams>, - DeviceScanParams, false, user_config_helper>, + DeviceScanParams, false>, DeviceScanParams>, DeviceScanParams>, DeviceScanParams>, diff --git a/test/rocprim/test_intrinsics.cpp b/test/rocprim/test_intrinsics.cpp index 66751bb39..138705677 
100644 --- a/test/rocprim/test_intrinsics.cpp +++ b/test/rocprim/test_intrinsics.cpp @@ -163,7 +163,7 @@ struct test_type_helper for(size_t i = 0; i < result.size(); ++i) { result[i].i = static_cast(random_data[i * 3]); - result[i].u = static_cast(random_data[i * 4 + 1]); + result[i].u = static_cast(random_data[i * 3 + 1]); result[i].f = random_data[i * 3 + 2]; } @@ -206,12 +206,13 @@ T bit_extract(const T value, const unsigned int bits) return bits == bit_size ? value : value & ((T{1} << bits) - T{1}); } -std::vector active_lanes_tests() +std::vector active_lanes_tests(int device_id) { std::vector tests = {all_lanes_active, 0x0123'4567'89AB'CDEF, 0xAAAA'AAAA'AAAA'AAAA}; - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); for(auto& test : tests) { test = bit_extract(test, hardware_warp_size); @@ -262,7 +263,8 @@ void test_shuffle() SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); const size_t size = hardware_warp_size; SCOPED_TRACE(testing::Message() << "with hardware_warp_size = " << hardware_warp_size); @@ -313,7 +315,7 @@ void test_shuffle() auto input = test_type_helper::get_random_data(size, seed_value); std::vector output(input.size()); - for(const auto active_lanes : active_lanes_tests()) + for(const auto active_lanes : active_lanes_tests(device_id)) { SCOPED_TRACE(testing::Message() << "with active_lanes = " << std::bitset<64>(active_lanes)); @@ -405,7 +407,8 @@ TYPED_TEST(RocprimIntrinsicsTests, ShuffleIndex) HIP_CHECK(hipSetDevice(device_id)); using T = typename TestFixture::type; - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + 
HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); const size_t size = hardware_warp_size; for (size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -512,7 +515,9 @@ TEST(RocprimIntrinsicsTests, LaneId) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -571,7 +576,9 @@ TEST(RocprimIntrinsicsTests, MaskedBitCount) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -616,7 +623,7 @@ TEST(RocprimIntrinsicsTests, MaskedBitCount) { SCOPED_TRACE(testing::Message() << "with add = " << add); - for(const auto active_lanes : active_lanes_tests()) + for(const auto active_lanes : active_lanes_tests(device_id)) { SCOPED_TRACE(testing::Message() << "with active_lanes = " << std::bitset<64>(active_lanes)); @@ -696,7 +703,9 @@ void warp_any_all_test() SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -729,7 +738,7 @@ void warp_any_all_test() all_lanes_active - 1, seed_value); - for(const 
auto active_lanes : active_lanes_tests()) + for(const auto active_lanes : active_lanes_tests(device_id)) { SCOPED_TRACE(testing::Message() << "with active_lanes = " << std::bitset<64>(active_lanes)); @@ -822,7 +831,9 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute) using T = typename TestFixture::type; - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -880,7 +891,7 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute) const auto wrap = test_utils::get_random_data(size, 0, 4, seed_value); - for(const auto active_lanes : active_lanes_tests()) + for(const auto active_lanes : active_lanes_tests(device_id)) { SCOPED_TRACE(testing::Message() << "with active_lanes = " << std::bitset<64>(active_lanes)); @@ -942,14 +953,15 @@ TYPED_TEST(RocprimIntrinsicsTests, WarpPermute) template __global__ void match_any_kernel(max_lane_mask_type* output, unsigned int* input, - max_lane_mask_type active_lanes) + max_lane_mask_type active_lanes, + max_lane_mask_type lane_predicates) { const unsigned int index = blockIdx.x * blockDim.x + threadIdx.x; - const auto value = input[index]; max_lane_mask_type result = test_type_helper::uninitialized(); if(is_lane_active(active_lanes, rocprim::lane_id())) - result = rocprim::match_any(value); + result = rocprim::match_any(input[index], + is_lane_active(lane_predicates, rocprim::lane_id())); output[index] = result; } @@ -959,7 +971,9 @@ TEST(RocprimIntrinsicsTests, MatchAny) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t 
block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -987,63 +1001,72 @@ TEST(RocprimIntrinsicsTests, MatchAny) 1u << (label_bits + 3), seed_value); - for(const auto active_lanes : active_lanes_tests()) + const auto active_lanes_for_testing = active_lanes_tests(device_id); + for(const auto& active_lanes : active_lanes_for_testing) { - SCOPED_TRACE(testing::Message() - << "with active_lanes = " << std::bitset<64>(active_lanes)); - - for(size_t block = 0; block < blocks; ++block) + for(const auto& lane_predicates : active_lanes_for_testing) { - for(size_t warp = 0; warp < warps_per_block; ++warp) - { - const auto base = (block * warps_per_block + warp) * hardware_warp_size; - std::vector histogram(1u << label_bits, 0); + SCOPED_TRACE(testing::Message() + << "with lane_predicates = " << std::bitset<64>(lane_predicates)); + SCOPED_TRACE(testing::Message() + << "with active_lanes = " << std::bitset<64>(active_lanes)); - for(size_t lane = 0; lane < hardware_warp_size; ++lane) + for(size_t block = 0; block < blocks; ++block) + { + for(size_t warp = 0; warp < warps_per_block; ++warp) { - if(is_lane_active(active_lanes, lane)) - { - const auto value = bit_extract(input[base + lane], label_bits); - histogram[value] |= max_lane_mask_type{1} << lane; - } - } + const auto base = (block * warps_per_block + warp) * hardware_warp_size; + std::vector histogram(1u << label_bits, 0); - for(size_t lane = 0; lane < hardware_warp_size; ++lane) - { - if(is_lane_active(active_lanes, lane)) + for(size_t lane = 0; lane < hardware_warp_size; ++lane) { - const auto value = bit_extract(input[base + lane], label_bits); - expected[base + lane] = histogram[value]; + if(is_lane_active(active_lanes, lane) + && is_lane_active(lane_predicates, lane)) + { + const auto value = bit_extract(input[base + lane], label_bits); + histogram[value] |= max_lane_mask_type{1} << lane; + } } - else + + for(size_t lane = 0; lane < hardware_warp_size; ++lane) { - expected[base + lane] = 
test_type_helper::uninitialized(); + if(!is_lane_active(active_lanes, lane)) + { + expected[base + lane] + = test_type_helper::uninitialized(); + continue; + } + + const auto value = bit_extract(input[base + lane], label_bits); + expected[base + lane] + = is_lane_active(lane_predicates, lane) ? histogram[value] : 0; } } } - } - HIP_CHECK(hipMemcpy(d_input, - input.data(), - size * sizeof(unsigned int), - hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_input, + input.data(), + size * sizeof(unsigned int), + hipMemcpyHostToDevice)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(match_any_kernel), - dim3(blocks), - dim3(block_size), - 0, - hipStreamDefault, - d_output, - d_input, - active_lanes); - HIP_CHECK(hipGetLastError()); + hipLaunchKernelGGL(HIP_KERNEL_NAME(match_any_kernel), + dim3(blocks), + dim3(block_size), + 0, + hipStreamDefault, + d_output, + d_input, + active_lanes, + lane_predicates); + HIP_CHECK(hipGetLastError()); - HIP_CHECK(hipMemcpy(output.data(), - d_output, - size * sizeof(max_lane_mask_type), - hipMemcpyDeviceToHost)); + HIP_CHECK(hipMemcpy(output.data(), + d_output, + size * sizeof(max_lane_mask_type), + hipMemcpyDeviceToHost)); - test_utils::assert_eq(output, expected); + test_utils::assert_eq(output, expected); + } } } @@ -1069,7 +1092,9 @@ TEST(RocprimIntrinsicsTests, Ballot) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - const size_t hardware_warp_size = ::rocprim::host_warp_size(); + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; const size_t block_size = warps_per_block * hardware_warp_size; const size_t blocks = 2; @@ -1093,7 +1118,7 @@ TEST(RocprimIntrinsicsTests, Ballot) const auto input = test_utils::get_random_data01(size, 0.5f, seed_value); - for(const auto active_lanes : active_lanes_tests()) + for(const auto active_lanes : active_lanes_tests(device_id)) { SCOPED_TRACE(testing::Message() 
<< "with active_lanes = " << std::bitset<64>(active_lanes)); @@ -1146,3 +1171,125 @@ TEST(RocprimIntrinsicsTests, Ballot) hipFree(d_input); hipFree(d_output); } + +__global__ void group_elect_kernel(max_lane_mask_type* output, + max_lane_mask_type* input, + size_t warps_per_block) +{ + const unsigned int input_index = blockIdx.x * blockDim.x + threadIdx.x; + + const unsigned int output_index + = blockIdx.x * warps_per_block + threadIdx.x / ::rocprim::device_warp_size(); + + if(rocprim::group_elect(input[input_index])) + { + atomicOr(&output[output_index], max_lane_mask_type{1} << ::rocprim::lane_id()); + } +} + +TEST(RocprimIntrinsicsTests, GroupElect) +{ + const int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + unsigned int hardware_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, hardware_warp_size)); + const size_t warps_per_block = 4; + const size_t block_size = warps_per_block * hardware_warp_size; + const size_t blocks = 48; + const size_t number_of_warps = blocks * warps_per_block; + SCOPED_TRACE(testing::Message() << "with hardware_warp_size = " << hardware_warp_size); + + max_lane_mask_type* d_input; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_input, blocks * block_size * sizeof(*d_input))); + + max_lane_mask_type* d_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, number_of_warps * sizeof(*d_output))); + + std::vector output; + output.reserve(number_of_warps); + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? 
rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + std::vector input(blocks * block_size, 0); + std::vector> warp_histograms(blocks * warps_per_block); + + auto input_it = input.begin(); + for(size_t block = 0; block < blocks; ++block) + { + for(size_t warp = 0; warp < warps_per_block; ++warp) + { + const std::vector group_labels + = test_utils::get_random_data(hardware_warp_size, + 0, + hardware_warp_size, + seed_value + warp); + + auto& histogram = warp_histograms[block * warps_per_block + warp]; + histogram.assign(hardware_warp_size + 1, 0); + for(size_t lane = 0; lane < hardware_warp_size; ++lane) + { + const unsigned label = group_labels[lane]; + histogram[label] |= max_lane_mask_type{1} << lane; + } + + input_it + = std::transform(group_labels.begin(), + group_labels.end(), + input_it, + [&](unsigned int label) + { + // Mark some lanes as invalid (not part of any group) + return label < hardware_warp_size ? histogram[label] : 0; + }); + } + } + + output.assign(number_of_warps, 0); + + HIP_CHECK(hipMemcpy(d_input, + input.data(), + blocks * block_size * sizeof(*d_input), + hipMemcpyHostToDevice)); + + HIP_CHECK(hipMemset(d_output, 0, number_of_warps * sizeof(*d_output))); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(group_elect_kernel), + dim3(blocks), + dim3(block_size), + 0, + hipStreamDefault, + d_output, + d_input, + warps_per_block); + HIP_CHECK(hipGetLastError()); + + HIP_CHECK(hipMemcpy(output.data(), + d_output, + number_of_warps * sizeof(output[0]), + hipMemcpyDeviceToHost)); + + for(size_t i = 0; i < blocks * block_size; ++i) + { + const auto group_mask = input[i]; + const auto warp_output = output[i / hardware_warp_size]; + if(group_mask > 0) + { + const max_lane_mask_type group_elect = group_mask & warp_output; + ASSERT_TRUE(rocprim::detail::is_power_of_two(group_elect)); + } + else + { + ASSERT_EQ(warp_output & (max_lane_mask_type{1} << (i % hardware_warp_size)), 0); + } + } + } + + 
hipFree(d_input); + hipFree(d_output); +} diff --git a/test/rocprim/test_utils.hpp b/test/rocprim/test_utils.hpp index 522c309e3..573b5cbb9 100644 --- a/test/rocprim/test_utils.hpp +++ b/test/rocprim/test_utils.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -21,11 +21,12 @@ #ifndef TEST_TEST_UTILS_HPP_ #define TEST_TEST_UTILS_HPP_ -#include +#include +#include #include #include #include -#include +#include // Identity iterator #include "identity_iterator.hpp" @@ -432,16 +433,16 @@ void iota(ForwardIt first, ForwardIt last, T value) } } -#define SKIP_IF_UNSUPPORTED_WARP_SIZE(test_warp_size) { \ - const auto host_warp_size = ::rocprim::host_warp_size(); \ - if (host_warp_size < (test_warp_size)) \ - { \ - GTEST_SKIP() << "Cannot run test of warp size " \ - << (test_warp_size) \ - << " on a device with warp size " \ - << host_warp_size; \ - } \ -} +#define SKIP_IF_UNSUPPORTED_WARP_SIZE(test_warp_size, device_id) \ + { \ + unsigned int host_warp_size; \ + HIP_CHECK(::rocprim::host_warp_size(device_id, host_warp_size)); \ + if(host_warp_size < (test_warp_size)) \ + { \ + GTEST_SKIP() << "Cannot run test of warp size " << (test_warp_size) \ + << " on a device with warp size " << host_warp_size; \ + } \ + } template struct DeviceSelectWarpSize diff --git a/test/rocprim/test_utils_assertions.hpp b/test/rocprim/test_utils_assertions.hpp index cb5e713c3..3e9ff4e93 100644 --- a/test/rocprim/test_utils_assertions.hpp +++ b/test/rocprim/test_utils_assertions.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -240,6 +240,81 @@ void assert_bit_eq(const std::vector& result, const std::vector& expected) } } +#if defined(__GNUC__) || defined(__clang__) +inline void assert_bit_eq(const std::vector<__int128_t>& result, + const std::vector<__int128_t>& expected) +{ + ASSERT_EQ(result.size(), expected.size()); + + auto to_string = [](__int128_t value) + { + static const char* charmap = "0123456789"; + + std::string result; + result.reserve(41); // max. 40 digits possible ( uint64_t has 20) plus sign + __uint128_t helper = (value < 0) ? -value : value; + + do + { + result += charmap[helper % 10]; + helper /= 10; + } + while(helper); + if(value < 0) + { + result += "-"; + } + std::reverse(result.begin(), result.end()); + return result; + }; + + for(size_t i = 0; i < result.size(); i++) + { + if(!bit_equal(result[i], expected[i])) + { + FAIL() << "Expected strict/bitwise equality of these values: " << std::endl + << " result[i]: " << to_string(result[i]) << std::endl + << " expected[i]: " << to_string(expected[i]) << std::endl + << "where index = " << i; + } + } +} + +inline void assert_bit_eq(const std::vector<__uint128_t>& result, + const std::vector<__uint128_t>& expected) +{ + ASSERT_EQ(result.size(), expected.size()); + + auto to_string = [](__uint128_t value) + { + static const char* charmap = "0123456789"; + + std::string result; + result.reserve(40); // max. 
40 digits possible ( uint64_t has 20) + __uint128_t helper = value; + + do + { + result += charmap[helper % 10]; + helper /= 10; + } + while(helper); + std::reverse(result.begin(), result.end()); + return result; + }; + + for(size_t i = 0; i < result.size(); i++) + { + if(!bit_equal(result[i], expected[i])) + { + FAIL() << "Expected strict/bitwise equality of these values: " << std::endl + << " result[i]: " << to_string(result[i]) << std::endl + << " expected[i]: " << to_string(expected[i]) << std::endl + << "where index = " << i; + } + } +} +#endif } #endif //ROCPRIM_TEST_UTILS_ASSERTIONS_HPP diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index 4c8881f12..74ea08440 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2021-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -23,10 +23,16 @@ // Std::memcpy and std::memcmp #include +#include -#include "test_utils_half.hpp" +#include +#include +#include + +#include "common_test_header.hpp" #include "test_utils_bfloat16.hpp" #include "test_utils_custom_test_types.hpp" +#include "test_utils_half.hpp" namespace test_utils { @@ -113,6 +119,23 @@ template<> class numeric_limits : public std::numeric_limi }; // End of extended numeric_limits +// Converts possible device side types to their relevant host side native types +inline rocprim::native_half convert_to_native(const rocprim::half& value) +{ + return rocprim::native_half(value); +} + +inline rocprim::native_bfloat16 convert_to_native(const rocprim::bfloat16& value) +{ + return rocprim::native_bfloat16(value); +} + +template +inline auto convert_to_native(const T& value) +{ + return value; +} + // Helper class to generate a vector of special values for any type template struct special_values { @@ -159,6 +182,90 @@ void add_special_values(std::vector& source, seed_type seed_value) } } +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> + typename std::enable_if::value, std::vector>::type +{ + engine_type gen{seed_value}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, int, unsigned int>::type>::type; + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + std::vector data(size); + size_t segment_size = size / random_data_generation_segments; + if(segment_size != 0) + { + for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; + segment_index++) + { + if(segment_index % random_data_generation_repeat_strides == 0) + { + T repeated_value = static_cast(distribution(gen)); + std::fill(data.begin() + 
segment_size * segment_index, + data.begin() + segment_size * (segment_index + 1), + repeated_value); + } + else + { + std::generate(data.begin() + segment_size * segment_index, + data.begin() + segment_size * (segment_index + 1), + [&]() { return static_cast(distribution(gen)); }); + } + } + } + else + { + std::generate(data.begin(), + data.end(), + [&]() { return static_cast(distribution(gen)); }); + } + return data; +} + +template +inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> + typename std::enable_if::value, std::vector>::type +{ + engine_type gen{seed_value}; + using dis_type = typename std::conditional< + is_valid_for_int_distribution::value, + T, + typename std::conditional::value, int, unsigned int>::type>::type; + std::uniform_int_distribution distribution(static_cast(min), + static_cast(max)); + std::vector data(size); + size_t segment_size = size / random_data_generation_segments; + if(segment_size != 0) + { + for(uint32_t segment_index = 0; segment_index < random_data_generation_segments; + segment_index++) + { + if(segment_index % random_data_generation_repeat_strides == 0) + { + T repeated_value = static_cast(distribution(gen)); + std::fill(data.begin() + segment_size * segment_index, + data.begin() + segment_size * (segment_index + 1), + repeated_value); + } + else + { + std::generate(data.begin() + segment_size * segment_index, + data.begin() + segment_size * (segment_index + 1), + [&]() { return static_cast(distribution(gen)); }); + } + } + } + else + { + std::generate(data.begin(), + data.end(), + [&]() { return static_cast(distribution(gen)); }); + } + return data; +} + template inline auto get_random_data(size_t size, U min, V max, seed_type seed_value) -> typename std::enable_if::value, std::vector>::type @@ -431,6 +538,33 @@ std::vector get_large_sizes(T seed_value) std::sort(sizes.begin(), sizes.end()); return sizes; } + +/// \brief Computes the closest multiple of \p divisor to a certain \p ref. 
+/// \param ref Number to be rounded up. +/// \param divisor Number which closest multiple to \p ref we are looking for. +inline size_t closest_greater_multiple(const size_t ref, const size_t divisor) +{ + if(!divisor) + { + return ref; + } + const size_t remainder = ref % divisor; + size_t distance = remainder ? divisor - remainder : 0; + return ref + distance; +} + +template +std::vector get_block_size_multiples(T seed_value, const unsigned int block_size) +{ + std::vector sizes = get_sizes(seed_value); + std::transform(sizes.begin(), + sizes.end(), + sizes.begin(), + [block_size](size_t size) + { return test_utils::closest_greater_multiple(size, block_size); }); + std::set unique_sizes(sizes.begin(), sizes.end()); + return std::vector(unique_sizes.begin(), unique_sizes.end()); +} } #endif //ROCPRIM_TEST_UTILS_DATA_GENERATION_HPP diff --git a/test/rocprim/test_utils_sort_comparator.hpp b/test/rocprim/test_utils_sort_comparator.hpp index 71eed3cc7..fa7be26ce 100644 --- a/test/rocprim/test_utils_sort_comparator.hpp +++ b/test/rocprim/test_utils_sort_comparator.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -65,6 +65,46 @@ struct key_comparator +struct key_comparator::value>::type> +{ + static constexpr Key radix_mask_upper + = EndBit == 8 * sizeof(Key) ? ~Key(0) : (Key(1) << EndBit) - 1; + static constexpr Key radix_mask_bottom = (Key(1) << StartBit) - 1; + static constexpr Key radix_mask = radix_mask_upper ^ radix_mask_bottom; + + bool operator()(const Key& lhs, const Key& rhs) const + { + Key l = lhs & radix_mask; + Key r = rhs & radix_mask; + return Descending ? 
(r < l) : (l < r); + } +}; + +template +struct key_comparator::value>::type> +{ + static constexpr Key radix_mask_upper + = EndBit == 8 * sizeof(Key) ? ~Key(0) : (Key(1) << EndBit) - 1; + static constexpr Key radix_mask_bottom = (Key(1) << StartBit) - 1; + static constexpr Key radix_mask = radix_mask_upper ^ radix_mask_bottom; + + bool operator()(const Key& lhs, const Key& rhs) const + { + Key l = lhs & radix_mask; + Key r = rhs & radix_mask; + return Descending ? (r < l) : (l < r); + } +}; + template struct key_comparator BlockParamsIntegral; +typedef ::testing::Types), + block_param_type(uint8_t, short), + block_param_type(int8_t, float), + block_param_type(__uint128_t, short), + block_param_type(__int128_t, float)> + BlockParamsIntegralExtended; + typedef ::testing::Types< block_param_type(float, long), block_param_type(double, test_utils::custom_test_type), diff --git a/test/rocprim/test_warp_exchange.cpp b/test/rocprim/test_warp_exchange.cpp index 802dc362f..ba985a904 100644 --- a/test/rocprim/test_warp_exchange.cpp +++ b/test/rocprim/test_warp_exchange.cpp @@ -221,7 +221,8 @@ TYPED_TEST(WarpExchangeTest, WarpExchange) constexpr unsigned int block_size = warp_size; constexpr unsigned int items_count = items_per_thread * block_size; - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); @@ -336,7 +337,8 @@ TYPED_TEST(WarpExchangeScatterTest, WarpExchangeScatter) constexpr unsigned int items_count = items_per_thread * block_size; using OffsetT = unsigned short; - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); diff --git a/test/rocprim/test_warp_load.cpp 
b/test/rocprim/test_warp_load.cpp index cc1592664..e1a14cfa0 100644 --- a/test/rocprim/test_warp_load.cpp +++ b/test/rocprim/test_warp_load.cpp @@ -185,7 +185,8 @@ TYPED_TEST(WarpLoadTest, WarpLoad) constexpr unsigned int block_size = 1024; constexpr unsigned int items_count = items_per_thread * block_size; - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); @@ -238,7 +239,8 @@ TYPED_TEST(WarpLoadTest, WarpLoadGuarded) constexpr unsigned int valid_items = warp_size / 4; constexpr T oob_default = std::numeric_limits::max(); - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); diff --git a/test/rocprim/test_warp_reduce.hpp b/test/rocprim/test_warp_reduce.hpp index c3c4371f6..02da74dd3 100644 --- a/test/rocprim/test_warp_reduce.hpp +++ b/test/rocprim/test_warp_reduce.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -56,7 +56,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceSum) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -175,7 +176,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, AllReduceSum) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -298,7 +300,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceSumValid) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -419,7 +422,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, AllReduceSumValid) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -542,7 +546,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, ReduceCustomStruct) ? 
rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -674,7 +679,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, HeadSegmentedReduceSum) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; @@ -829,7 +835,8 @@ typed_test_def(RocprimWarpReduceTests, name_suffix, TailSegmentedReduceSum) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; static constexpr unsigned int grid_size = 4; diff --git a/test/rocprim/test_warp_scan.hpp b/test/rocprim/test_warp_scan.hpp index 193a69459..4573b22f8 100644 --- a/test/rocprim/test_warp_scan.hpp +++ b/test/rocprim/test_warp_scan.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -56,7 +56,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScan) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -178,7 +179,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScanReduce) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -322,7 +324,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ExclusiveScan) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -447,7 +450,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ExclusiveReduceScan) ? 
rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -599,7 +603,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, Scan) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -757,7 +762,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ScanReduce) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; @@ -936,7 +942,8 @@ typed_test_def(RocprimWarpScanTests, name_suffix, InclusiveScanCustomType) ? rocprim::max(ws64, logical_warp_size * 4) : rocprim::max((ws64/logical_warp_size), 1) * logical_warp_size; - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); const size_t block_size = current_device_warp_size == ws32 ? 
block_size_ws32 : block_size_ws64; const unsigned int grid_size = 4; diff --git a/test/rocprim/test_warp_sort.hpp b/test/rocprim/test_warp_sort.hpp index a1835c830..b1da8d1b5 100644 --- a/test/rocprim/test_warp_sort.hpp +++ b/test/rocprim/test_warp_sort.hpp @@ -1,6 +1,6 @@ // MIT License // -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -41,7 +41,8 @@ typed_test_def(RocprimWarpSortShuffleBasedTests, name_suffix, Sort) static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); static constexpr size_t block_size = std::max(256U, logical_warp_size * 4); static constexpr unsigned int grid_size = 4; @@ -139,7 +140,8 @@ typed_test_def(RocprimWarpSortShuffleBasedTests, name_suffix, SortKeyInt) static constexpr size_t ws32 = size_t(ROCPRIM_WARP_SIZE_32); static constexpr size_t ws64 = size_t(ROCPRIM_WARP_SIZE_64); - const unsigned int current_device_warp_size = rocprim::host_warp_size(); + unsigned int current_device_warp_size; + HIP_CHECK(::rocprim::host_warp_size(device_id, current_device_warp_size)); static constexpr size_t block_size = std::max(256U, logical_warp_size * 4); static constexpr unsigned int grid_size = 4; diff --git a/test/rocprim/test_warp_store.cpp b/test/rocprim/test_warp_store.cpp index 872d96328..2d6a0430a 100644 --- a/test/rocprim/test_warp_store.cpp +++ b/test/rocprim/test_warp_store.cpp @@ -175,7 +175,8 @@ TYPED_TEST(WarpStoreTest, WarpLoad) constexpr unsigned int block_size = 1024; constexpr unsigned int items_count = 
items_per_thread * block_size; - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0)); @@ -227,7 +228,8 @@ TYPED_TEST(WarpStoreTest, WarpStoreGuarded) constexpr unsigned items_count = items_per_thread * block_size; constexpr int valid_items = warp_size / 4; - SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size); + int device_id = test_common_utils::obtain_device_from_ctest(); + SKIP_IF_UNSUPPORTED_WARP_SIZE(warp_size, device_id); std::vector input(items_count); std::iota(input.begin(), input.end(), static_cast(0));