diff --git a/CHANGELOG.md b/CHANGELOG.md index ff6db71e0..ed41489b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Documentation for rocPRIM is available at ### Additions +* Added the parallel `find_first_of` device function with autotuned configurations, this function is similar to `std::find_first_of`, it searches for the first occurrence of any of the provided elements. + ### Changes ### Fixes diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 03cada66c..cda46e4a3 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -136,6 +136,7 @@ add_rocprim_benchmark(benchmark_config_dispatch.cpp) add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp) add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp) add_rocprim_benchmark(benchmark_device_binary_search.cpp) +add_rocprim_benchmark(benchmark_device_find_first_of.cpp) add_rocprim_benchmark(benchmark_device_histogram.cpp) add_rocprim_benchmark(benchmark_device_merge.cpp) add_rocprim_benchmark(benchmark_device_merge_sort.cpp) diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake index c014a5b57..8c18e334d 100644 --- a/benchmark/ConfigAutotuneSettings.cmake +++ b/benchmark/ConfigAutotuneSettings.cmake @@ -111,5 +111,9 @@ DataType;BlockSize;" PARENT_SCOPE) set(list_across_names "KeyType;ValueType;BlockSize;TilesPerBlock" PARENT_SCOPE) set(list_across "${LIMITED_TUNING_TYPES};${TUNING_TYPES};128 192 256 384 512;1 2" PARENT_SCOPE) set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@TilesPerBlock@" PARENT_SCOPE) + elseif(file STREQUAL "benchmark_device_find_first_of") + set(list_across_names "DataType;BlockSize" PARENT_SCOPE) + set(list_across "${LIMITED_TUNING_TYPES};32 64 128 256 512 1024" PARENT_SCOPE) + set(output_pattern_suffix "@DataType@_@BlockSize@" PARENT_SCOPE) endif() endfunction() diff --git a/benchmark/benchmark_device_find_first_of.cpp b/benchmark/benchmark_device_find_first_of.cpp new file mode 100644 index 000000000..e48a603df --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.cpp @@ -0,0 +1,150 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark_device_find_first_of.parallel.hpp" +#include "benchmark_utils.hpp" + +// CmdParser +#include "cmdparser.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +#include +#include + +#ifndef DEFAULT_BYTES +constexpr size_t DEFAULT_BYTES = size_t{1} << 27; // 128 MiB +#endif + +#define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE) \ + { \ + const device_find_first_of_benchmark instance(KEYS_SIZE, FIRST_OCCURENCE); \ + REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ + } + +// clang-format off +#define CREATE_BENCHMARK0(TYPE, KEYS_SIZE) \ + { \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.1) \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.5) \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 1.0) \ + } + +#define CREATE_BENCHMARK(TYPE) \ + { \ + CREATE_BENCHMARK0(TYPE, 1) \ + CREATE_BENCHMARK0(TYPE, 10) \ + CREATE_BENCHMARK0(TYPE, 100) \ + CREATE_BENCHMARK0(TYPE, 1000) \ + CREATE_BENCHMARK0(TYPE, 10000) \ + } +// clang-format on + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + parser.set_optional("seed", "seed", "random", get_seed_message()); +#ifdef BENCHMARK_CONFIG_TUNING + // optionally run an evenly split subset of benchmarks, when making multiple program invocations + parser.set_optional("parallel_instance", + "parallel_instance", + 0, + "parallel instance index"); + parser.set_optional("parallel_instances", + "parallel_instances", + 1, + "total parallel instances"); +#endif + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); + const std::string seed_type = parser.get("seed"); + const managed_seed seed(seed_type); + + // HIP + hipStream_t stream = 0; // default + + // Benchmark info + add_common_benchmark_info(); + benchmark::AddCustomContext("size", std::to_string(size)); + benchmark::AddCustomContext("seed", seed_type); + + // Add benchmarks + std::vector benchmarks{}; +#ifdef BENCHMARK_CONFIG_TUNING + const int parallel_instance = parser.get("parallel_instance"); + const int parallel_instances = parser.get("parallel_instances"); + config_autotune_register::register_benchmark_subset(benchmarks, + parallel_instance, + parallel_instances, + size, + seed, + stream); +#else // BENCHMARK_CONFIG_TUNING + CREATE_BENCHMARK(int8_t) + CREATE_BENCHMARK(int16_t) + CREATE_BENCHMARK(int32_t) + CREATE_BENCHMARK(float) + CREATE_BENCHMARK(int64_t) + CREATE_BENCHMARK(double) + + using custom_int2 = custom_type; + using custom_longlong_double = custom_type; + + CREATE_BENCHMARK(custom_int2) + CREATE_BENCHMARK(custom_longlong_double) +#endif // BENCHMARK_CONFIG_TUNING + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_device_find_first_of.parallel.cpp.in b/benchmark/benchmark_device_find_first_of.parallel.cpp.in new file mode 100644 index 000000000..230f6640c --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.cpp.in @@ -0,0 +1,30 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark_device_find_first_of.parallel.hpp" +#include "benchmark_utils.hpp" + +namespace +{ +auto benchmarks = config_autotune_register::create_bulk( + device_find_first_of_benchmark_generator<@DataType@, @BlockSize@>::create); +} // namespace diff --git a/benchmark/benchmark_device_find_first_of.parallel.hpp b/benchmark/benchmark_device_find_first_of.parallel.hpp new file mode 100644 index 000000000..5681fe69e --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.hpp @@ -0,0 +1,282 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ +#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ + +#include "benchmark_utils.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +// rocPRIM +#include + +#include +#include +#include + +template +std::string config_name() +{ + const rocprim::detail::find_first_of_config_params config = Config(); + return "{bs:" + std::to_string(config.kernel_config.block_size) + + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; +} + +template<> +inline std::string config_name() +{ + return "default_config"; +} + +template +struct device_find_first_of_benchmark : public config_autotune_interface +{ + std::vector keys_sizes; + std::vector first_occurrences; + + device_find_first_of_benchmark(size_t keys_size, double first_occurrence) + { + keys_sizes.push_back(keys_size); + first_occurrences.push_back(first_occurrence); + } + + device_find_first_of_benchmark(const std::vector& keys_sizes, + const std::vector& first_occurrences) + { + this->keys_sizes = keys_sizes; + this->first_occurrences = first_occurrences; + } + + std::string name() const override + { + using namespace std::string_literals; + return bench_naming::format_name( + "{lvl:device,algo:find_first_of,"s + + (keys_sizes.size() == 1 ? "keys_size:"s + std::to_string(keys_sizes[0]) : ""s) + + (first_occurrences.size() == 1 + ? ",first_occurrence:"s + std::to_string(first_occurrences[0]) + : ""s) + + ",value_type:"s + std::string(Traits::name()) + ",cfg:" + config_name() + + "}"); + } + + static constexpr unsigned int batch_size = 10; + static constexpr unsigned int warmup_size = 2; + + void run(benchmark::State& state, + size_t bytes, + const managed_seed& seed, + hipStream_t stream) const override + { + using type = T; + using key_type = T; + using output_type = size_t; + + const size_t size = bytes / sizeof(type); + + const size_t max_keys_size = *std::max_element(keys_sizes.begin(), keys_sizes.end()); + + // Generate data + std::vector key_input + = get_random_data(max_keys_size, 0, 100, seed.get_0()); + std::vector input + = get_random_data(size, 101, generate_limits::max(), seed.get_0()); + + std::vector d_inputs(first_occurrences.size()); + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + type* d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); + HIP_CHECK(hipMemcpyAsync(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + // Set the first occurrence of keys in input + const size_t p = static_cast(size * first_occurrences[fi]); + if(p < size) + { + const type key = key_input[0]; + HIP_CHECK(hipMemcpyAsync(d_input + p, + &key, + sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + } + d_inputs[fi] = d_input; + } + + key_type* d_key_input; + output_type* d_output; + HIP_CHECK(hipMalloc(&d_key_input, max_keys_size * sizeof(*d_key_input))); + HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_key_input, + key_input.data(), + key_input.size() * sizeof(*d_key_input), + hipMemcpyHostToDevice)); + + ::rocprim::equal_to compare_op; + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + auto run = [&](size_t key_size, const type* d_input) + { + HIP_CHECK(rocprim::find_first_of(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_key_input, + d_output, + input.size(), + key_size, + compare_op, + stream)); + }; + + size_t max_temporary_storage_bytes = 0; + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[0]); + max_temporary_storage_bytes + = std::max(max_temporary_storage_bytes, temporary_storage_bytes); + } + temporary_storage_bytes = max_temporary_storage_bytes; + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + + // Only a part of data (before the first occurrence) must be actually processed. In ideal + // cases when no thread blocks do unneeded work (i.e. exit early once the match is found), + // performance for different values of first_occurrence must be similar. + size_t sum_effective_size = 0; + for(double first_occurrence : first_occurrences) + { + sum_effective_size += static_cast(size * first_occurrence); + } + size_t sum_keys_size = 0; + for(size_t keys_size : keys_sizes) + { + sum_keys_size += keys_size; + } + state.SetBytesProcessed(state.iterations() * batch_size * sum_effective_size + * sizeof(*d_inputs[0])); + state.SetItemsProcessed(state.iterations() * batch_size * sum_effective_size); + // Each input is read once but all keys are read by all threads so performance is likely + // compute-bound or bound by cache bandwidth for reading keys rather than reading inputs. + // Let's additionally report the rate of comparisons to see if it reaches a plateau with + // increasing keys_size. + state.counters["comparisons_per_second"] + = benchmark::Counter(static_cast(state.iterations() * batch_size + * sum_effective_size * sum_keys_size), + benchmark::Counter::kIsRate); + + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + HIP_CHECK(hipFree(d_inputs[fi])); + } + HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temporary_storage)); + } +}; + +template +struct device_find_first_of_benchmark_generator +{ + + template + struct create_ipt + { + using generated_config = rocprim::find_first_of_config; + + void operator()(std::vector>& storage) + { + std::vector keys_sizes{1, 10, 100, 1000}; + std::vector first_occurrences{0.1, 0.5, 1.0}; + storage.emplace_back( + std::make_unique>( + keys_sizes, + first_occurrences)); + } + }; + + static void create(std::vector>& storage) + { + static constexpr unsigned int min_items_per_thread = 1; + static constexpr unsigned int max_items_per_thread = 16; + static_for_each, + create_ipt>(storage); + } +}; + +#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ diff --git a/docs/device_ops/find_first_of.rst b/docs/device_ops/find_first_of.rst new file mode 100644 index 000000000..1781ae067 --- /dev/null +++ b/docs/device_ops/find_first_of.rst @@ -0,0 +1,19 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-find_first_of: + + +Find first of +------------- + +Configuring the kernel +~~~~~~~~~~~~~~~~~~~~~~ + +.. doxygenstruct:: rocprim::find_first_of_config + +find_first_of +~~~~~~~~~~~~~ + +.. doxygenfunction:: rocprim::find_first_of(void* temporary_storage, size_t& storage_size, InputIterator1 input, InputIterator2 keys, OutputIterator output, size_t size, size_t keys_size, BinaryFunction compare_function = BinaryFunction(), hipStream_t stream = 0, bool debug_synchronous = false) diff --git a/docs/device_ops/index.rst b/docs/device_ops/index.rst index 3c27a1c15..74db4ee48 100644 --- a/docs/device_ops/index.rst +++ b/docs/device_ops/index.rst @@ -25,3 +25,4 @@ * :ref:`dev-memcpy` * :ref:`dev-nth_element` * :ref:`dev-partial_sort` + * :ref:`dev-find_first_of` diff --git a/docs/reference/ops_summary.rst b/docs/reference/ops_summary.rst index 9dbf13d68..9121e2e31 100644 --- a/docs/reference/ops_summary.rst +++ b/docs/reference/ops_summary.rst @@ -50,6 +50,11 @@ Data Movement * ``load`` the complementary operations of the above ones. * ``memcpy`` copies bytes between device sources and destinations +Sequence Search +=============== + +* ``find_first_of`` searches for the first occurrence of any of the provided elements. + Other operations ====================== diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 29e2bf154..b270acd5c 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -36,6 +36,7 @@ subtrees: - file: device_ops/histogram.rst - file: device_ops/device_copy.rst - file: device_ops/memcpy.rst + - file: device_ops/find_first_of.rst - file: block_ops/index.rst subtrees: - entries: diff --git a/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp new file mode 100644 index 000000000..ce06277a8 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp @@ -0,0 +1,263 @@ +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ + +#include "../../../type_traits.hpp" +#include "../device_config_helper.hpp" + +#include + +/* DO NOT EDIT THIS FILE + * This file is automatically generated by `/scripts/autotune/create_optimization.py`. + * so most likely you want to edit rocprim/device/device_(algo)_config.hpp + */ + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 15> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<128, 13> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 15> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<1024, 14> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +} // end namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp index 74374f011..095f4f50f 100644 --- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp +++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp @@ -1082,6 +1082,46 @@ struct nth_element_config : public detail::nth_element_config_params #endif }; +namespace detail +{ + +struct find_first_of_config_params +{ + kernel_config_params kernel_config{}; +}; + +} // namespace detail + +/// \brief Configuration of device-level find_first_of +/// +/// \tparam BlockSize number of threads in a block. +/// \tparam ItemsPerThread number of items processed by each thread. +template +struct find_first_of_config : public detail::find_first_of_config_params +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS + constexpr find_first_of_config() + : detail::find_first_of_config_params{ + {BlockSize, ItemsPerThread, 0} + } + {} +#endif +}; + +namespace detail +{ + +template +struct default_find_first_of_config_base +{ + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = find_first_of_config<256, ::rocprim::max(1u, 16u / item_scale)>; +}; + +} // namespace detail + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/device/device_find_first_of.hpp b/rocprim/include/rocprim/device/device_find_first_of.hpp new file mode 100644 index 000000000..80080b4f7 --- /dev/null +++ b/rocprim/include/rocprim/device/device_find_first_of.hpp @@ -0,0 +1,396 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ + +#include "../config.hpp" +#include "../detail/temp_storage.hpp" +#include "config_types.hpp" +#include "detail/ordered_block_id.hpp" +#include "device_find_first_of_config.hpp" +#include "device_transform.hpp" + +#include +#include +#include +#include +#include + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + do \ + { \ + hipError_t _error = hipGetLastError(); \ + if(_error != hipSuccess) \ + return _error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + hipError_t __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) \ + return __error; \ + auto _end = std::chrono::steady_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ + } \ + } \ + while(0) + +ROCPRIM_KERNEL +static void + init_find_first_of_kernel(size_t* output, size_t size, ordered_block_id ordered_bid) +{ + *output = size; + ordered_bid.reset(); +} + +template +ROCPRIM_KERNEL __launch_bounds__(device_params().kernel_config.block_size) +void find_first_of_kernel(InputIterator1 input, + InputIterator2 keys, + size_t* output, + size_t size, + size_t keys_size, + ordered_block_id ordered_bid, + BinaryFunction compare_function) +{ + constexpr find_first_of_config_params params = device_params(); + + constexpr unsigned int block_size = params.kernel_config.block_size; + constexpr unsigned int items_per_thread = params.kernel_config.items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; + constexpr unsigned int identity = std::numeric_limits::max(); + + using type = typename std::iterator_traits::value_type; + using key_type = typename std::iterator_traits::value_type; + + const unsigned int thread_id = ::rocprim::detail::block_thread_id<0>(); + + ROCPRIM_SHARED_MEMORY struct + { + unsigned int block_first_index; + size_t global_first_index; + + typename decltype(ordered_bid)::storage_type ordered_bid; + } storage; + + if(thread_id == 0) + { + storage.block_first_index = identity; + } + syncthreads(); + + while(true) + { + if(thread_id == 0) + { + storage.global_first_index = atomic_load(output); + } + const size_t block_id = ordered_bid.get(thread_id, storage.ordered_bid); + const size_t block_offset = block_id * items_per_block; + // ordered_bid.get() calls syncthreads(), it is safe to read global_first_index + + // Exit if all input has been processed or one of previous blocks has found a match + if(block_offset >= storage.global_first_index) + { + break; + } + + unsigned int thread_first_index = identity; + + if(block_offset + items_per_block <= size) + { + type items[items_per_thread]; + block_load_direct_striped(thread_id, input + block_offset, items); + for(size_t key_index = 0; key_index < keys_size; ++key_index) + { + const key_type key = keys[key_index]; + ROCPRIM_UNROLL + for(unsigned int i = 0; i < items_per_thread; ++i) + { + if(compare_function(key, items[i])) + { + thread_first_index = min(thread_first_index, i); + } + } + } + } + else + { + const unsigned int valid = size - block_offset; + + type items[items_per_thread]; + block_load_direct_striped(thread_id, input + block_offset, items, valid); + for(size_t key_index = 0; key_index < keys_size; ++key_index) + { + const key_type key = keys[key_index]; + ROCPRIM_UNROLL + for(unsigned int i = 0; i < items_per_thread; ++i) + { + if(i * block_size + thread_id < valid && compare_function(key, items[i])) + { + thread_first_index = min(thread_first_index, i); + } + } + } + } + + if(thread_first_index != identity) + { + // This happens to some blocks rarely so it is not beneficial to avoid atomic conflicts + // with block_reduce which needs to be computed even if no threads have a match. + atomic_min(&storage.block_first_index, thread_first_index * block_size + thread_id); + } + syncthreads(); + if(storage.block_first_index != identity) + { + if(thread_id == 0) + { + atomic_min(output, block_offset + storage.block_first_index); + } + break; + } + } +} + +template +ROCPRIM_INLINE +hipError_t find_first_of_impl(void* temporary_storage, + size_t& storage_size, + InputIterator1 input, + InputIterator2 keys, + OutputIterator output, + size_t size, + size_t keys_size, + BinaryFunction compare_function, + hipStream_t stream, + bool debug_synchronous) +{ + using type = typename std::iterator_traits::value_type; + using config = wrapped_find_first_of_config; + + target_arch target_arch; + hipError_t result = host_target_arch(stream, target_arch); + if(result != hipSuccess) + { + return result; + } + const find_first_of_config_params params = dispatch_target_arch(target_arch); + + const unsigned int block_size = params.kernel_config.block_size; + const unsigned int items_per_thread = params.kernel_config.items_per_thread; + const unsigned int items_per_block = block_size * items_per_thread; + + using ordered_bid_type = ordered_block_id; + + // As output can be an arbitrary iterator, we need to use an intermediate buffer to do atomic + // operations with it + size_t* tmp_output = nullptr; + ordered_bid_type::id_type* ordered_bid_storage = nullptr; + + // Calculate required temporary storage + result = temp_storage::partition( + temporary_storage, + storage_size, + temp_storage::make_linear_partition( + temp_storage::ptr_aligned_array(&tmp_output, 1), + temp_storage::make_partition(&ordered_bid_storage, + ordered_bid_type::get_temp_storage_layout()))); + if(result != hipSuccess || temporary_storage == nullptr) + { + return result; + } + + auto ordered_bid = ordered_bid_type::create(ordered_bid_storage); + + std::chrono::steady_clock::time_point start; + + if(debug_synchronous) + { + start = std::chrono::steady_clock::now(); + } + init_find_first_of_kernel<<<1, 1, 0, stream>>>(tmp_output, size, ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_find_first_of_kernel", 1, start); + + if(size > 0 && keys_size > 0) + { + auto kernel = find_first_of_kernel; + + const size_t shared_memory_size = 0; + + // Choose minimum grid size needed to achieve the highest occupancy + int min_grid_size, max_block_size; + result = hipOccupancyMaxPotentialBlockSize(&min_grid_size, + &max_block_size, + kernel, + shared_memory_size, + int(block_size)); + if(result != hipSuccess) + { + return result; + } + + const size_t num_blocks + = std::min(size_t(min_grid_size), ceiling_div(size, items_per_block)); + + if(debug_synchronous) + { + start = std::chrono::steady_clock::now(); + } + kernel<<>>(input, + keys, + tmp_output, + size, + keys_size, + ordered_bid, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("find_first_of_kernel", size, start); + } + + return transform(tmp_output, output, 1, ::rocprim::identity(), stream, debug_synchronous); +} + +} // namespace detail + +/// \addtogroup devicemodule +/// @{ + +/// \brief Searches the range [input, input + size) for any of the elements in the range +/// [keys, keys + keys_size). +/// +/// \par Overview +/// * The contents of the inputs are not altered by the function. +/// * Returns the required size of `temporary_storage` in `storage_size` if `temporary_storage` is +// a null pointer. +/// * Accepts custom compare_function. +/// +/// \tparam Config [optional] configuration of the primitive. It has to be `find_first_of_config`. +/// \tparam InputIterator1 [inferred] random-access iterator type of the input range. Must meet the +/// requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam InputIterator2 [inferred] random-access iterator type of the input range. Must meet the +/// requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam OutputIterator [inferred] random-access iterator type of the output range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam CompareFunction [inferred] Type of binary function that accepts two arguments of the +/// type `InputIterator1` and returns a value convertible to bool. Default type is +/// `::rocprim::equal_to<>.` +/// +/// \param [in] temporary_storage pointer to a device-accessible temporary storage. When +/// a null pointer is passed, the required allocation size (in bytes) is written to +/// `storage_size` and function returns without performing the search. +/// \param [in,out] storage_size reference to a size (in bytes) of `temporary_storage`. +/// \param [in] input iterator to the range of elements to examine. +/// \param [in] keys iterator to the range of elements to search for. +/// \param [out] output iterator to the output range. `output` should be able to be written for 1 +/// element. `*output` constains the position of the first element in the range +/// [input, input + size) that is equal to an element from the range [keys, keys + keys_size). +// If no such element is found, `*output` contains `size`. +/// \param [in] size number of elements to examine. +/// \param [in] keys_size number of elements to search for. +/// \param [in] compare_function binary operation function object that will be used for comparison. +/// The signature of the function should be equivalent to the following: +/// bool f(const T &a, const T &b);. The signature does not need to have +/// const &, but function object must not modify the objects passed to it. +/// \param [in] stream [optional] HIP stream object. Default is `0` (default stream). +/// \param [in] debug_synchronous [optional] If true, synchronization after every kernel +/// launch is forced in order to check for errors. Default value is `false`. +/// +/// \returns `hipSuccess` (`0`) after successful search; otherwise a HIP runtime error of +/// type `hipError_t`. +/// +/// \par Example +/// \parblock +/// In this example a device-level find_first_of is performed where inputs and keys are +/// represented by an array of unsigned integers. +/// +/// \code{.cpp} +/// #include +/// +/// // Prepare input and output (declare pointers, allocate device memory etc.) +/// size_t size; // e.g., 8 +/// size_t keys_size; // e.g., 2 +/// unsigned int* input; // e.g., [ 6, 3, 5, 4, 1, 8, 2, 7 ] +/// unsigned int* keys; // e.g., [ 10, 5 ] +/// unsigned int* keys_output; // 1 element +/// +/// size_t temporary_storage_size_bytes; +/// void * temporary_storage_ptr = nullptr; +/// // Get required size of the temporary storage +/// rocprim::find_first_of( +/// temporary_storage_ptr, temporary_storage_size_bytes, +/// input, keys, output, size, keys_size +/// ); +/// +/// // allocate temporary storage +/// hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes); +/// +/// // perform find_first_of +/// rocprim::find_first_of( +/// temporary_storage_ptr, temporary_storage_size_bytes, +/// input, keys, output, size, keys_size +/// ); +/// // output: [ 2 ] +/// \endcode +/// \endparblock +template::value_type>> +ROCPRIM_INLINE +hipError_t find_first_of(void* temporary_storage, + size_t& storage_size, + InputIterator1 input, + InputIterator2 keys, + OutputIterator output, + size_t size, + size_t keys_size, + BinaryFunction compare_function = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) +{ + return detail::find_first_of_impl(temporary_storage, + storage_size, + input, + keys, + output, + size, + keys_size, + compare_function, + stream, + debug_synchronous); +} + +/// @} +// end of group devicemodule + +END_ROCPRIM_NAMESPACE + +#endif // ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/rocprim/include/rocprim/device/device_find_first_of_config.hpp b/rocprim/include/rocprim/device/device_find_first_of_config.hpp new file mode 100644 index 000000000..26c0a30d7 --- /dev/null +++ b/rocprim/include/rocprim/device/device_find_first_of_config.hpp @@ -0,0 +1,78 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ +#define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ + +#include "config_types.hpp" +#include "detail/config/device_find_first_of.hpp" +#include "detail/device_config_helper.hpp" + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +// generic struct that instantiates custom configurations +template +struct wrapped_find_first_of_config +{ + template + struct architecture_config + { + static constexpr find_first_of_config_params params = Config{}; + }; +}; + +// specialized for rocprim::default_config, which instantiates the default_find_first_of_config +template +struct wrapped_find_first_of_config +{ + template + struct architecture_config + { + static constexpr find_first_of_config_params params + = default_find_first_of_config(Arch), Type>(); + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +template +template +constexpr find_first_of_config_params + wrapped_find_first_of_config::architecture_config::params; + +template +template +constexpr find_first_of_config_params + wrapped_find_first_of_config::architecture_config::params; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +} // namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ diff --git a/rocprim/include/rocprim/intrinsics/atomic.hpp b/rocprim/include/rocprim/intrinsics/atomic.hpp index f9a8aa084..f0daea443 100644 --- a/rocprim/include/rocprim/intrinsics/atomic.hpp +++ b/rocprim/include/rocprim/intrinsics/atomic.hpp @@ -58,7 +58,25 @@ namespace detail } ROCPRIM_DEVICE ROCPRIM_INLINE - unsigned int atomic_wrapinc(unsigned int * address, unsigned int value) + unsigned int atomic_min(unsigned int* address, unsigned int value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned long atomic_min(unsigned long* address, unsigned long value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned long long atomic_min(unsigned long long* address, unsigned long long value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned int atomic_wrapinc(unsigned int* address, unsigned int value) { return ::atomicInc(address, value); } diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 5da311dc5..66d2e359c 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -52,19 +52,10 @@ unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0) int c; #ifndef __HIP_CPU_RT__ #if ROCPRIM_WAVEFRONT_SIZE == 32 - #ifdef __HIP__ c = ::__builtin_amdgcn_mbcnt_lo(x, add); - #else - c = ::__mbcnt_lo(x, add); - #endif #else - #ifdef __HIP__ c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); - #else - c = ::__mbcnt_lo(static_cast(x), add); - c = ::__mbcnt_hi(static_cast(x >> 32), c); - #endif #endif #else using namespace hip::detail; diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py index 130bdb3cf..2bebf1fd5 100755 --- a/scripts/autotune/create_optimization.py +++ b/scripts/autotune/create_optimization.py @@ -622,6 +622,14 @@ class AlgorithmDeviceReduceByKey(Algorithm): def __init__(self, fallback_entries): Algorithm.__init__(self, fallback_entries) +class AlgorithmDeviceFindFirstOf(Algorithm): + algorithm_name = "device_find_first_of" + cpp_configuration_template_name = "find_first_of_config_template" + config_selection_params = [ + SelectionType(name="value_type", is_optional=False, select_on_size_only=True)] + def __init__(self, fallback_entries): + Algorithm.__init__(self, fallback_entries) + def filt_algo_regex(e: FallbackCase, algorithm_name): if e.algo_regex: return re.match(e.algo_regex, algorithm_name) is not None @@ -679,6 +687,8 @@ def create_algorithm(algorithm_name: str, fallback_entries: List[FallbackCase]): return AlgorithmDeviceSelectUniqueByKey(fallback_entries) elif algorithm_name == 'device_reduce_by_key': return AlgorithmDeviceReduceByKey(fallback_entries) + elif algorithm_name == 'device_find_first_of': + return AlgorithmDeviceFindFirstOf(fallback_entries) else: raise(NotSupportedError(f'Algorithm "{algorithm_name}" is not supported (yet)')) diff --git a/scripts/autotune/templates/find_first_of_config_template b/scripts/autotune/templates/find_first_of_config_template new file mode 100644 index 000000000..84e7fb476 --- /dev/null +++ b/scripts/autotune/templates/find_first_of_config_template @@ -0,0 +1,20 @@ +{% extends "config_template" %} + +{% macro get_header_guard() %} +ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +{%- endmacro %} + +{% macro kernel_configuration(measurement) -%} +find_first_of_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { }; +{%- endmacro %} + +{% macro general_case() -%} +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; +{%- endmacro %} + +{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%} +// Based on {{ based_on_type }} +template struct default_find_first_of_config({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> : +{%- endmacro %} diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 9d7f9a4b2..5307f6db6 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -258,6 +258,7 @@ add_rocprim_test("rocprim.constant_iterator" test_constant_iterator.cpp) add_rocprim_test("rocprim.counting_iterator" test_counting_iterator.cpp) add_rocprim_test("rocprim.device_batch_memcpy" test_device_batch_memcpy.cpp) add_rocprim_test("rocprim.device_binary_search" test_device_binary_search.cpp) +add_rocprim_test("rocprim.device_find_first_of" test_device_find_first_of.cpp) add_rocprim_test("rocprim.device_adjacent_difference" test_device_adjacent_difference.cpp) add_rocprim_test("rocprim.device_histogram" test_device_histogram.cpp) add_rocprim_test("rocprim.device_merge" test_device_merge.cpp) @@ -296,4 +297,3 @@ add_rocprim_test("rocprim.warp_scan" test_warp_scan.cpp) add_rocprim_test("rocprim.warp_sort" test_warp_sort.cpp) add_rocprim_test("rocprim.warp_store" test_warp_store.cpp) add_rocprim_test("rocprim.zip_iterator" test_zip_iterator.cpp) - diff --git a/test/rocprim/identity_iterator.hpp b/test/rocprim/identity_iterator.hpp index f5f6c9a3d..5219c1e59 100644 --- a/test/rocprim/identity_iterator.hpp +++ b/test/rocprim/identity_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ namespace test_utils { -// Output iterator used in tests to check situtations when +// Output iterator used in tests to check situations when // value_type of output iterator is void template class identity_iterator diff --git a/test/rocprim/indirect_iterator.hpp b/test/rocprim/indirect_iterator.hpp index 88e29b5fb..17688dae4 100644 --- a/test/rocprim/indirect_iterator.hpp +++ b/test/rocprim/indirect_iterator.hpp @@ -63,7 +63,7 @@ class reference_wrapper T* _ptr; }; -// Iterator used in tests to check situtations when value_type of the +// Iterator used in tests to check situations when value_type of the // iterator is not the same as the return type of operator[]. // It is a simplified version of device_vector::iterator from thrust. template diff --git a/test/rocprim/test_device_find_first_of.cpp b/test/rocprim/test_device_find_first_of.cpp new file mode 100644 index 000000000..ca3c7b9ce --- /dev/null +++ b/test/rocprim/test_device_find_first_of.cpp @@ -0,0 +1,392 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// required test headers +#include "indirect_iterator.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_float_type.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "../common_test_header.hpp" + +// required rocprim headers +#include +// #include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Params for tests +template, + class Config = rocprim::default_config, + bool UseGraphs = false, + bool UseIndirectIterator = false> +struct DeviceFindFirstOfParams +{ + using type = Type; + using key_type = KeyType; + using output_type = OutputType; + using compare_function = CompareFunction; + using config = Config; + static constexpr bool use_graphs = UseGraphs; + static constexpr bool use_indirect_iterator = UseIndirectIterator; +}; + +struct custom_compare1 +{ + template + ROCPRIM_HOST_DEVICE ROCPRIM_INLINE + bool operator()(const T& a, const U& b) const + { + // Since data is random, the chance of equality is negligible for floating point numbers + return static_cast(a * 1.234) == static_cast(b * 1.234); + } +}; + +struct custom_compare2 +{ + template + ROCPRIM_HOST_DEVICE ROCPRIM_INLINE + bool operator()(test_utils::custom_test_type a, test_utils::custom_test_type b) + { + return a.x == b.x; + } +}; + +template +class RocprimDeviceFindFirstOfTests : public ::testing::Test +{ +public: + using type = typename Params::type; + using key_type = typename Params::key_type; + using output_type = typename Params::output_type; + using compare_function = typename Params::compare_function; + using config = typename Params::config; + static constexpr bool debug_synchronous = false; + static constexpr bool use_graphs = Params::use_graphs; + static constexpr bool use_indirect_iterator = Params::use_indirect_iterator; +}; + +using RocprimDeviceFindFirstOfTestsParams + = ::testing::Types, + DeviceFindFirstOfParams, + rocprim::default_config, + true, + true>, + DeviceFindFirstOfParams, + rocprim::default_config, + true, + false>, + DeviceFindFirstOfParams, + DeviceFindFirstOfParams, + test_utils::custom_test_type, + size_t, + custom_compare2, + rocprim::default_config, + false, + true>>; + +TYPED_TEST_SUITE(RocprimDeviceFindFirstOfTests, RocprimDeviceFindFirstOfTestsParams); + +TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::type; + using key_type = typename TestFixture::key_type; + using output_type = typename TestFixture::output_type; + using compare_function = typename TestFixture::compare_function; + using config = typename TestFixture::config; + + constexpr bool debug_synchronous = TestFixture::debug_synchronous; + constexpr bool use_indirect_iterator = TestFixture::use_indirect_iterator; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + for(size_t size : test_utils::get_sizes(seed_value)) + { + SCOPED_TRACE(testing::Message() << "with size = " << size); + + const size_t keys_size + = std::sqrt(test_utils::get_random_value(0, size, seed_value)); + + // Starting point is an appoximate position of the first match we want to test for + for(double starting_point : {0.0, 0.234, 0.876, 1.0, 100.0}) + { + SCOPED_TRACE(testing::Message() << "with starting_point = " << starting_point); + + hipStream_t stream = 0; // default + if(TestFixture::use_graphs) + { + // Default stream does not support hipGraph stream capture, so create one + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + + // Generate data + auto keys = test_utils::get_random_data(keys_size, 0, 10, seed_value + 1); + + std::vector input(size); + // Generate the input data in such a way that it does not contain any values from + // keys before the starting point + const size_t size1 + = starting_point >= 1.0 ? size : static_cast(size * starting_point); + const size_t size2 = size - size1; + if(size1 > 0) + { + auto input1 = test_utils::get_random_data(size1, 20, 100, seed_value + 2); + std::copy(input1.begin(), input1.end(), input.begin()); + } + if(size2 > 0) + { + auto input2 = test_utils::get_random_data(size2, 0, 100, seed_value + 3); + std::copy(input2.begin(), input2.end(), input.begin() + size1); + } + + // Explicitly test for boundary cases + if(size > 0 && keys_size > 0) + { + if(starting_point == 0.0) + { + input[0] = keys[keys_size - 1]; + } + else if(starting_point == 1.0) + { + input[size - 1] = keys[0]; + } + } + + type* d_input; + key_type* d_keys; + output_type* d_output; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(*d_input))); + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_keys, keys.size() * sizeof(*d_keys))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_keys, + keys.data(), + keys.size() * sizeof(*d_keys), + hipMemcpyHostToDevice)); + + const auto input_it + = test_utils::wrap_in_indirect_iterator(d_input); + const auto keys_it + = test_utils::wrap_in_indirect_iterator(d_keys); + + // compare function + compare_function compare_op; + + // temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + input.size(), + keys.size(), + compare_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + hipGraph_t graph; + if(TestFixture::use_graphs) + { + graph = test_utils::createGraphHelper(stream); + } + + // Run + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + input.size(), + keys.size(), + compare_op, + stream, + debug_synchronous)); + + hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) + { + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + } + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + output_type output; + + // Copy output to host + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(*d_output), hipMemcpyDeviceToHost)); + + // Check + auto expected = std::find_first_of(input.begin(), + input.end(), + keys.begin(), + keys.end(), + compare_op) + - input.begin(); + + ASSERT_EQ(output, expected); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_keys)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); + + if(TestFixture::use_graphs) + { + test_utils::cleanupGraphHelper(graph, graph_instance); + HIP_CHECK(hipStreamDestroy(stream)); + } + } + } + } +} + +TEST(RocprimDeviceFindFirstOfTests, LargeIndices) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using output_type = size_t; + using config = rocprim::default_config; + + constexpr bool debug_synchronous = false; + + for(size_t size : test_utils::get_large_sizes(seeds[0])) + { + SCOPED_TRACE(testing::Message() << "with size = " << size); + + const size_t keys_size = 12; + + for(double starting_point : {0.0, 0.12, 0.78, 1.1}) + { + SCOPED_TRACE(testing::Message() << "with starting_point = " << starting_point); + + hipStream_t stream = 0; // default + + output_type* d_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, sizeof(*d_output))); + + const output_type expected + = std::min(size, static_cast(starting_point * size)); + + auto input_it = rocprim::make_counting_iterator(size_t(0)); + auto keys_it = rocprim::make_counting_iterator(expected); + + rocprim::equal_to compare_op; + + // temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + size, + keys_size, + compare_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + size, + keys_size, + compare_op, + stream, + debug_synchronous)); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Copy output to host and check + output_type output; + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(*d_output), hipMemcpyDeviceToHost)); + ASSERT_EQ(output, expected); + + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); + } + } +} diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index d45d9acef..6358dffbe 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -265,6 +265,10 @@ inline OutputIter segmented_generate_n(OutputIter it, size_t size, Generator&& g std::generate_n(it + segment_size * segment_index, segment_size, gen); } } + // Generate the remaining items + std::generate_n(it + segment_size * random_data_generation_segments, + size - segment_size * random_data_generation_segments, + gen); return it + size; }