From 3507b6392f41d95fc5780b8ce80a45b0539db6ea Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Thu, 8 Aug 2024 14:11:54 +0500 Subject: [PATCH 1/8] Remove hcc path from masked_bit_count --- rocprim/include/rocprim/intrinsics/warp.hpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/rocprim/include/rocprim/intrinsics/warp.hpp b/rocprim/include/rocprim/intrinsics/warp.hpp index 5da311dc5..66d2e359c 100644 --- a/rocprim/include/rocprim/intrinsics/warp.hpp +++ b/rocprim/include/rocprim/intrinsics/warp.hpp @@ -52,19 +52,10 @@ unsigned int masked_bit_count(lane_mask_type x, unsigned int add = 0) int c; #ifndef __HIP_CPU_RT__ #if ROCPRIM_WAVEFRONT_SIZE == 32 - #ifdef __HIP__ c = ::__builtin_amdgcn_mbcnt_lo(x, add); - #else - c = ::__mbcnt_lo(x, add); - #endif #else - #ifdef __HIP__ c = ::__builtin_amdgcn_mbcnt_lo(static_cast(x), add); c = ::__builtin_amdgcn_mbcnt_hi(static_cast(x >> 32), c); - #else - c = ::__mbcnt_lo(static_cast(x), add); - c = ::__mbcnt_hi(static_cast(x >> 32), c); - #endif #endif #else using namespace hip::detail; From 2b0434cf3ed49884483d52f707dcf5282eb7b85d Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Wed, 21 Aug 2024 16:01:36 +0500 Subject: [PATCH 2/8] Fix data generation in tests: tailing items may be uninitialized --- test/rocprim/test_utils_data_generation.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index d45d9acef..6358dffbe 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -265,6 +265,10 @@ inline OutputIter segmented_generate_n(OutputIter it, size_t size, Generator&& g std::generate_n(it + segment_size * segment_index, segment_size, gen); } } + // Generate the remaining items + std::generate_n(it + segment_size * random_data_generation_segments, + size - segment_size * random_data_generation_segments, + gen); return it + size; } From a6922c9d2d9de0b3698290a9a6a164e3a1fad59b Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Mon, 12 Aug 2024 16:03:52 +0500 Subject: [PATCH 3/8] find_first_of: Add a naive implementation --- benchmark/CMakeLists.txt | 1 + benchmark/benchmark_device_find_first_of.cpp | 125 ++++++++ benchmark/benchmark_device_find_first_of.hpp | 176 +++++++++++ .../rocprim/device/device_find_first_of.hpp | 121 +++++++ test/rocprim/CMakeLists.txt | 2 +- test/rocprim/test_device_find_first_of.cpp | 297 ++++++++++++++++++ 6 files changed, 721 insertions(+), 1 deletion(-) create mode 100644 benchmark/benchmark_device_find_first_of.cpp create mode 100644 benchmark/benchmark_device_find_first_of.hpp create mode 100644 rocprim/include/rocprim/device/device_find_first_of.hpp create mode 100644 test/rocprim/test_device_find_first_of.cpp diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 03cada66c..cda46e4a3 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -136,6 +136,7 @@ add_rocprim_benchmark(benchmark_config_dispatch.cpp) add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp) add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp) add_rocprim_benchmark(benchmark_device_binary_search.cpp) +add_rocprim_benchmark(benchmark_device_find_first_of.cpp) add_rocprim_benchmark(benchmark_device_histogram.cpp) add_rocprim_benchmark(benchmark_device_merge.cpp) add_rocprim_benchmark(benchmark_device_merge_sort.cpp) diff --git a/benchmark/benchmark_device_find_first_of.cpp b/benchmark/benchmark_device_find_first_of.cpp new file mode 100644 index 000000000..67606e9fa --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.cpp @@ -0,0 +1,125 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark_device_find_first_of.hpp" +#include "benchmark_utils.hpp" + +// CmdParser +#include "cmdparser.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +#include +#include + +#ifndef DEFAULT_N +const size_t DEFAULT_N = 1024 * 1024 * 32; +#endif + +#define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE) \ + { \ + const device_find_first_of_benchmark instance(KEYS_SIZE, FIRST_OCCURENCE); \ + REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \ + } + +// clang-format off +#define CREATE_BENCHMARK0(TYPE, KEYS_SIZE) \ + { \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.1) \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.5) \ + CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 1.0) \ + } + +#define CREATE_BENCHMARK(TYPE) \ + { \ + CREATE_BENCHMARK0(TYPE, 10) \ + CREATE_BENCHMARK0(TYPE, 128) \ + CREATE_BENCHMARK0(TYPE, 1024) \ + CREATE_BENCHMARK0(TYPE, 10000) \ + } +// clang-format on + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.set_optional("name_format", + "name_format", + "human", + "either: json,human,txt"); + parser.set_optional("seed", "seed", "random", get_seed_message()); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + bench_naming::set_format(parser.get("name_format")); + const std::string seed_type = parser.get("seed"); + const managed_seed seed(seed_type); + + // HIP + hipStream_t stream = 0; // default + + // Benchmark info + add_common_benchmark_info(); + benchmark::AddCustomContext("size", std::to_string(size)); + benchmark::AddCustomContext("seed", seed_type); + + // Add benchmarks + std::vector benchmarks{}; + CREATE_BENCHMARK(int8_t) + CREATE_BENCHMARK(int16_t) + CREATE_BENCHMARK(int32_t) + CREATE_BENCHMARK(int64_t) + + using custom_int2 = custom_type; + using custom_longlong_double = custom_type; + + CREATE_BENCHMARK(custom_int2) + CREATE_BENCHMARK(custom_longlong_double) + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/benchmark/benchmark_device_find_first_of.hpp b/benchmark/benchmark_device_find_first_of.hpp new file mode 100644 index 000000000..970f911ec --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.hpp @@ -0,0 +1,176 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ + +#include "benchmark_utils.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +// rocPRIM +#include + +#include +#include +#include + +template +struct device_find_first_of_benchmark : public config_autotune_interface +{ + size_t keys_size; + double first_occurence; + + device_find_first_of_benchmark(size_t keys_size, double first_occurence) + : keys_size(keys_size), first_occurence(first_occurence) + {} + + std::string name() const override + { + using namespace std::string_literals; + return bench_naming::format_name( + "{lvl:device,algo:find_first_of,keys_size:" + std::to_string(keys_size) + + ",first_occurence:" + std::to_string(first_occurence) + + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); + } + + static constexpr unsigned int batch_size = 10; + static constexpr unsigned int warmup_size = 5; + + void run(benchmark::State& state, + size_t size, + const managed_seed& seed, + hipStream_t stream) const override + { + using type = Key; + using key_type = Key; + using output_type = size_t; + + // Generate data + std::vector key_input + = get_random_data(keys_size, 0, 100, seed.get_0()); + std::vector input + = get_random_data(size, 101, generate_limits::max(), seed.get_0()); + + // Set the first occurence of keys in input + const size_t p = static_cast(size * first_occurence); + if(p < size) + { + input[p] = key_input[keys_size / 2]; + } + + type* d_input; + key_type* d_key_input; + output_type* d_output; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); + HIP_CHECK(hipMalloc(&d_key_input, size * sizeof(*d_key_input))); + HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_key_input, + key_input.data(), + key_input.size() * sizeof(*d_key_input), + hipMemcpyHostToDevice)); + + ::rocprim::equal_to compare_op; + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + auto run = [&]() + { + HIP_CHECK(rocprim::find_first_of(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_key_input, + d_output, + input.size(), + key_input.size(), + compare_op, + stream)); + }; + + run(); + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + run(); + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + run(); + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + + // Only a part of data (before the first occurence) must be actually processed + const size_t effective_size = static_cast(size * first_occurence); + state.SetBytesProcessed(state.iterations() * batch_size * effective_size + * sizeof(*d_input)); + state.SetItemsProcessed(state.iterations() * batch_size * effective_size); + // All threads of all blocks read the same keys so this value is limited by cache bandwidth + state.counters["bytes_per_second_keys"] = benchmark::Counter( + static_cast(state.iterations() * batch_size * effective_size * keys_size + * sizeof(*d_key_input)), + benchmark::Counter::kIsRate, + benchmark::Counter::kIs1024); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temporary_storage)); + } +}; + +#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/rocprim/include/rocprim/device/device_find_first_of.hpp b/rocprim/include/rocprim/device/device_find_first_of.hpp new file mode 100644 index 000000000..be39fe585 --- /dev/null +++ b/rocprim/include/rocprim/device/device_find_first_of.hpp @@ -0,0 +1,121 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ + +#include "../detail/temp_storage.hpp" + +#include "../config.hpp" + +#include "../iterator/transform_iterator.hpp" +#include "../iterator/zip_iterator.hpp" +#include "../iterator/counting_iterator.hpp" +#include "config_types.hpp" +#include "device_reduce.hpp" + +#include +#include + +#include +#include + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +ROCPRIM_INLINE +hipError_t find_first_of_impl(void* temporary_storage, + size_t& storage_size, + InputIterator1 input, + InputIterator2 keys, + OutputIterator output, + size_t size, + size_t keys_size, + BinaryFunction compare_function, + hipStream_t stream, + bool debug_synchronous) +{ + return reduce( + temporary_storage, storage_size, + rocprim::make_transform_iterator( + rocprim::make_zip_iterator(rocprim::make_tuple(rocprim::make_counting_iterator(0), input)), + [keys, keys_size, size, compare_function] ROCPRIM_DEVICE (const auto& index_value) + { + for(size_t i = 0; i < keys_size; ++i) + { + if(compare_function(keys[i], get<1>(index_value))) + { + return get<0>(index_value); + } + } + return size; + } + ), + output, + size, + rocprim::minimum(), + stream, + debug_synchronous + ); +} + +} // namespace detail + +/// \addtogroup devicemodule +/// @{ + +template::value_type>> +ROCPRIM_INLINE +hipError_t find_first_of(void* temporary_storage, + size_t& storage_size, + InputIterator1 input, + InputIterator2 keys, + OutputIterator output, + size_t size, + size_t keys_size, + BinaryFunction compare_function = BinaryFunction(), + hipStream_t stream = 0, + bool debug_synchronous = false) +{ + return detail::find_first_of_impl( + temporary_storage, storage_size, + input, keys, output, size, keys_size, compare_function, + stream, debug_synchronous + ); +} + +/// @} +// end of group devicemodule + +END_ROCPRIM_NAMESPACE + +#endif // ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 9d7f9a4b2..5307f6db6 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -258,6 +258,7 @@ add_rocprim_test("rocprim.constant_iterator" test_constant_iterator.cpp) add_rocprim_test("rocprim.counting_iterator" test_counting_iterator.cpp) add_rocprim_test("rocprim.device_batch_memcpy" test_device_batch_memcpy.cpp) add_rocprim_test("rocprim.device_binary_search" test_device_binary_search.cpp) +add_rocprim_test("rocprim.device_find_first_of" test_device_find_first_of.cpp) add_rocprim_test("rocprim.device_adjacent_difference" test_device_adjacent_difference.cpp) add_rocprim_test("rocprim.device_histogram" test_device_histogram.cpp) add_rocprim_test("rocprim.device_merge" test_device_merge.cpp) @@ -296,4 +297,3 @@ add_rocprim_test("rocprim.warp_scan" test_warp_scan.cpp) add_rocprim_test("rocprim.warp_sort" test_warp_sort.cpp) add_rocprim_test("rocprim.warp_store" test_warp_store.cpp) add_rocprim_test("rocprim.zip_iterator" test_zip_iterator.cpp) - diff --git a/test/rocprim/test_device_find_first_of.cpp b/test/rocprim/test_device_find_first_of.cpp new file mode 100644 index 000000000..78c353e6a --- /dev/null +++ b/test/rocprim/test_device_find_first_of.cpp @@ -0,0 +1,297 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// required test headers +#include "indirect_iterator.hpp" +#include "test_utils_assertions.hpp" +#include "test_utils_custom_float_type.hpp" +#include "test_utils_custom_test_types.hpp" +#include "test_utils_data_generation.hpp" +#include "test_utils_types.hpp" + +#include "../common_test_header.hpp" + +// required rocprim headers +#include +// #include +#include +#include + +#include +#include +#include +#include + +#include +#include + +// Params for tests +template, + class Config = rocprim::default_config, + bool UseGraphs = false, + bool UseIndirectIterator = false> +struct DeviceFindFirstOfParams +{ + using type = Type; + using key_type = KeyType; + using compare_function = CompareFunction; + using config = Config; + static constexpr bool use_graphs = UseGraphs; + static constexpr bool use_indirect_iterator = UseIndirectIterator; +}; + +// std::find_first_of is available since C++17 +template +InputIt + find_first_of_(InputIt first, InputIt last, ForwardIt s_first, ForwardIt s_last, BinaryPred p) +{ + for(; first != last; ++first) + { + for(ForwardIt it = s_first; it != s_last; ++it) + { + if(p(*first, *it)) + { + return first; + } + } + } + return last; +} + +template +class RocprimDeviceFindFirstOfTests : public ::testing::Test +{ +public: + using type = typename Params::type; + using key_type = typename Params::key_type; + using compare_function = typename Params::compare_function; + using config = typename Params::config; + const bool debug_synchronous = false; + static constexpr bool use_graphs = Params::use_graphs; + static constexpr bool use_indirect_iterator = Params::use_indirect_iterator; +}; + +using RocprimDeviceFindFirstOfTestsParams = ::testing::Types, + DeviceFindFirstOfParams, + DeviceFindFirstOfParams>; + +TYPED_TEST_SUITE(RocprimDeviceFindFirstOfTests, RocprimDeviceFindFirstOfTestsParams); + +TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using type = typename TestFixture::type; + using key_type = typename TestFixture::key_type; + using output_type = size_t; + using compare_function = typename TestFixture::compare_function; + using config = typename TestFixture::config; + const bool debug_synchronous = TestFixture::debug_synchronous; + constexpr bool use_indirect_iterator = TestFixture::use_indirect_iterator; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); + + std::cout << "with seed = " << seed_value << std::endl; + + for(size_t size : test_utils::get_sizes(seed_value)) + { + + const size_t keys_size + = std::sqrt(test_utils::get_random_value(0, size, seed_value)); + + for(double starting_point : {0.0, 0.2, 0.8, 1.0}) + { + std::cout << "with size = " << size << ", keys_size = " << keys_size + << ", starting_point = " << starting_point << std::endl; + + hipStream_t stream = 0; // default + if(TestFixture::use_graphs) + { + // Default stream does not support hipGraph stream capture, so create one + HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + } + + SCOPED_TRACE(testing::Message() << "with size = " << size); + + // Generate data + std::vector input; + std::vector key_input; + if ROCPRIM_IF_CONSTEXPR(rocprim::is_floating_point::value) + { + // input = test_utils::get_random_data(size, -1000, 1000, seed_value); + // key_input + // = test_utils::get_random_data(keys_size, -1000, 1000, seed_value); + } + else + { + key_input = test_utils::get_random_data( + keys_size, + 0, + 10, + // test_utils::numeric_limits::min(), + // test_utils::numeric_limits::max(), + seed_value); + input = test_utils::get_random_data( + size, + 0, + 1000, + // test_utils::numeric_limits::min(), + // test_utils::numeric_limits::max(), + seed_value); + + if(size > 0 && keys_size > 0) + { + // Change the input range before starting_point to ensure that it does not contain + // any values from keys + auto minmax_key_input + = std::minmax_element(key_input.begin(), key_input.end()); + const auto min_key_input = *minmax_key_input.first; + const auto max_key_input = *minmax_key_input.second; + const auto max_input + = *std::minmax_element(input.begin(), input.end()).second; + // std::cout << "min_key_input = " << min_key_input << ", max_key_input = " << max_key_input << ", max_input = " << max_input << std::endl; + for(size_t i = 0; i < size * starting_point; ++i) + { + if(min_key_input <= input[i] && input[i] <= max_key_input) + { + input[i] = max_input; + } + } + } + } + + type* d_input; + key_type* d_key_input; + output_type* d_output; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(*d_input))); + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_key_input, + key_input.size() * sizeof(*d_key_input))); + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_key_input, + key_input.data(), + key_input.size() * sizeof(*d_key_input), + hipMemcpyHostToDevice)); + + const auto input_it + = test_utils::wrap_in_indirect_iterator(d_input); + const auto key_input_it + = test_utils::wrap_in_indirect_iterator(d_key_input); + + // compare function + compare_function compare_op; + + // temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + key_input_it, + d_output, + input.size(), + key_input.size(), + compare_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + hipGraph_t graph; + if(TestFixture::use_graphs) + { + graph = test_utils::createGraphHelper(stream); + } + + // Run + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + key_input_it, + d_output, + input.size(), + key_input.size(), + compare_op, + stream, + debug_synchronous)); + + hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) + { + graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + } + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + output_type output; + + // Copy output to host + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(*d_output), hipMemcpyDeviceToHost)); + + // Check + auto expected = find_first_of_(input.begin(), + input.end(), + key_input.begin(), + key_input.end(), + compare_op) + - input.begin(); + + std::cout << "expected = " << expected << ", output = " << output << ", " + << double(expected) / double(size) << std::endl; + + ASSERT_EQ(output, expected); + + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); + + if(TestFixture::use_graphs) + { + test_utils::cleanupGraphHelper(graph, graph_instance); + HIP_CHECK(hipStreamDestroy(stream)); + } + } + } + } +} From f4612169074a1841624669b0f9e2dbbf9982826d Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Wed, 21 Aug 2024 18:07:44 +0500 Subject: [PATCH 4/8] find_first_of: Add a specialized kernel with early-exit --- .../device/detail/device_config_helper.hpp | 26 ++ .../rocprim/device/device_find_first_of.hpp | 323 ++++++++++++++++-- .../device/device_find_first_of_config.hpp | 77 +++++ rocprim/include/rocprim/intrinsics/atomic.hpp | 20 +- 4 files changed, 408 insertions(+), 38 deletions(-) create mode 100644 rocprim/include/rocprim/device/device_find_first_of_config.hpp diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp index 74374f011..c539704c5 100644 --- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp +++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp @@ -1082,6 +1082,32 @@ struct nth_element_config : public detail::nth_element_config_params #endif }; +namespace detail +{ + +struct find_first_of_config_params +{ + kernel_config_params kernel_config; +}; + +} // namespace detail + +/// \brief Configuration of device-level find_first_of +/// +/// \tparam BlockSize number of threads in a block. +/// \tparam ItemsPerThread number of items processed by each thread. +template +struct find_first_of_config : public detail::find_first_of_config_params +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS + constexpr find_first_of_config() + : detail::find_first_of_config_params{ + {BlockSize, ItemsPerThread, ROCPRIM_GRID_SIZE_LIMIT} + } + {} +#endif +}; + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/device/device_find_first_of.hpp b/rocprim/include/rocprim/device/device_find_first_of.hpp index be39fe585..320382f3b 100644 --- a/rocprim/include/rocprim/device/device_find_first_of.hpp +++ b/rocprim/include/rocprim/device/device_find_first_of.hpp @@ -21,27 +21,154 @@ #ifndef ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ #define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_HPP_ -#include "../detail/temp_storage.hpp" - #include "../config.hpp" - -#include "../iterator/transform_iterator.hpp" -#include "../iterator/zip_iterator.hpp" -#include "../iterator/counting_iterator.hpp" +#include "../detail/temp_storage.hpp" #include "config_types.hpp" -#include "device_reduce.hpp" - -#include -#include +#include "device_find_first_of_config.hpp" +#include "device_transform.hpp" +#include #include #include +#include +#include BEGIN_ROCPRIM_NAMESPACE namespace detail { +#define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ + do \ + { \ + hipError_t _error = hipGetLastError(); \ + if(_error != hipSuccess) \ + return _error; \ + if(debug_synchronous) \ + { \ + std::cout << name << "(" << size << ")"; \ + hipError_t __error = hipStreamSynchronize(stream); \ + if(__error != hipSuccess) \ + return __error; \ + auto _end = std::chrono::steady_clock::now(); \ + auto _d = std::chrono::duration_cast>(_end - start); \ + std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ + } \ + } \ + while(0) + +#define RETURN_ON_ERROR(...) \ + do \ + { \ + hipError_t error = (__VA_ARGS__); \ + if(error != hipSuccess) \ + { \ + return error; \ + } \ + } \ + while(0) + +template +ROCPRIM_KERNEL __launch_bounds__(device_params().kernel_config.block_size) +void find_first_of_kernel(InputIterator1 input, + InputIterator2 keys, + size_t* output, + size_t size, + size_t keys_size, + BinaryFunction compare_function) +{ + constexpr find_first_of_config_params params = device_params(); + + constexpr unsigned int block_size = params.kernel_config.block_size; + constexpr unsigned int items_per_thread = params.kernel_config.items_per_thread; + constexpr unsigned int items_per_block = block_size * items_per_thread; + constexpr unsigned int identity = std::numeric_limits::max(); + + using type = typename std::iterator_traits::value_type; + using key_type = typename std::iterator_traits::value_type; + + const unsigned int thread_id = ::rocprim::detail::block_thread_id<0>(); + const unsigned int block_id = ::rocprim::detail::block_id<0>(); + const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); + + ROCPRIM_SHARED_MEMORY struct + { + unsigned int block_first_index; + size_t grid_first_index; + } storage; + + if(thread_id == 0) + { + storage.block_first_index = identity; + } + syncthreads(); + + size_t block_offset = block_id * items_per_block; + for(; block_offset < size; block_offset += number_of_blocks * items_per_block) + { + if(thread_id == 0) + { + storage.grid_first_index = atomic_load(output); + } + syncthreads(); + if(storage.grid_first_index < block_offset) + { + // No need to continue if one of previous blocks (or this one) has found a match + break; + } + + type items[items_per_thread]; + + unsigned int thread_first_index = identity; + + if(block_offset + items_per_block <= size) + { + block_load_direct_striped(thread_id, input + block_offset, items); + for(size_t key_index = 0; key_index < keys_size; ++key_index) + { + const key_type key = keys[key_index]; + ROCPRIM_UNROLL + for(unsigned int i = 0; i < items_per_thread; ++i) + { + if(compare_function(key, items[i])) + { + thread_first_index = min(thread_first_index, i); + } + } + } + } + else + { + const unsigned int valid = size - block_offset; + block_load_direct_striped(thread_id, input + block_offset, items, valid); + for(size_t key_index = 0; key_index < keys_size; ++key_index) + { + const key_type key = keys[key_index]; + ROCPRIM_UNROLL + for(unsigned int i = 0; i < items_per_thread; ++i) + { + if(i * block_size + thread_id < valid && compare_function(key, items[i])) + { + thread_first_index = min(thread_first_index, i); + } + } + } + } + + if(thread_first_index != identity) + { + // This happens to some blocks rarely so it is not beneficial to avoid atomic conflicts + // with block_reduce which needs to be computed even if no threads have a match. + atomic_min(&storage.block_first_index, thread_first_index * block_size + thread_id); + } + syncthreads(); + if(thread_id == 0 && storage.block_first_index != identity) + { + atomic_min(output, block_offset + storage.block_first_index); + } + } +} + template( - temporary_storage, storage_size, - rocprim::make_transform_iterator( - rocprim::make_zip_iterator(rocprim::make_tuple(rocprim::make_counting_iterator(0), input)), - [keys, keys_size, size, compare_function] ROCPRIM_DEVICE (const auto& index_value) - { - for(size_t i = 0; i < keys_size; ++i) - { - if(compare_function(keys[i], get<1>(index_value))) - { - return get<0>(index_value); - } - } - return size; - } - ), - output, - size, - rocprim::minimum(), - stream, - debug_synchronous - ); + using type = typename std::iterator_traits::value_type; + using config = wrapped_find_first_of_config; + + target_arch target_arch; + hipError_t result = host_target_arch(stream, target_arch); + if(result != hipSuccess) + { + return result; + } + const find_first_of_config_params params = dispatch_target_arch(target_arch); + + const unsigned int block_size = params.kernel_config.block_size; + const unsigned int items_per_thread = params.kernel_config.items_per_thread; + const unsigned int items_per_block = block_size * items_per_thread; + + if(temporary_storage == nullptr) + { + storage_size = sizeof(size_t); + return hipSuccess; + } + + size_t* tmp_output = reinterpret_cast(temporary_storage); + + RETURN_ON_ERROR( + hipMemcpyAsync(tmp_output, &size, sizeof(*tmp_output), hipMemcpyHostToDevice, stream)); + + if(size > 0) + { + std::chrono::steady_clock::time_point start; + if(debug_synchronous) + { + start = std::chrono::steady_clock::now(); + } + + const size_t shared_memory_size = 0; + auto kernel = find_first_of_kernel; + + // Choose minimum grid size needed to achieve the highest occupancy + int min_grid_size, max_block_size; + RETURN_ON_ERROR(hipOccupancyMaxPotentialBlockSize(&min_grid_size, + &max_block_size, + kernel, + shared_memory_size, + int(block_size))); + const size_t num_blocks + = std::min(size_t(min_grid_size), ceiling_div(size, items_per_block)); + + kernel<<>>(input, + keys, + tmp_output, + size, + keys_size, + compare_function); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("find_first_of_kernel", size, start); + } + + RETURN_ON_ERROR( + transform(tmp_output, output, 1, ::rocprim::identity(), stream, debug_synchronous)); + + return hipSuccess; } } // namespace detail @@ -88,6 +253,85 @@ hipError_t find_first_of_impl(void* temporary_storage, /// \addtogroup devicemodule /// @{ +/// \brief Searches the range [input, input + size) for any of the elements in the range +/// [keys, keys + keys_size). +/// +/// \par Overview +/// * The contents of the inputs are not altered by the function. +/// * Returns the required size of `temporary_storage` in `storage_size` if `temporary_storage` is +// a null pointer. +/// * Accepts custom compare_function. +/// +/// \tparam Config [optional] configuration of the primitive. It has to be `find_first_of_config`. +/// \tparam InputIterator1 [inferred] random-access iterator type of the input range. Must meet the +/// requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam InputIterator2 [inferred] random-access iterator type of the input range. Must meet the +/// requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam OutputIterator [inferred] random-access iterator type of the output range. Must meet +/// the requirements of a C++ InputIterator concept. It can be a simple pointer type. +/// \tparam CompareFunction [inferred] Type of binary function that accepts two arguments of the +/// type `InputIterator1` and returns a value convertible to bool. Default type is +/// `::rocprim::equal_to<>.` +/// +/// \param [in] temporary_storage pointer to a device-accessible temporary storage. When +/// a null pointer is passed, the required allocation size (in bytes) is written to +/// `storage_size` and function returns without performing the search. +/// \param [in,out] storage_size reference to a size (in bytes) of `temporary_storage`. +/// \param [in] input iterator to the range of elements to examine. +/// \param [in] keys iterator to the range of elements to search for. +/// \param [out] output iterator to the output range. `output` should be able to be written for 1 +/// element. `*output` constains the position of the first element in the range +/// [input, input + size) that is equal to an element from the range [keys, keys + keys_size). +// If no such element is found, `*output` contains `size`. +/// \param [in] size number of elements to examine. +/// \param [in] keys_size number of elements to search for. +/// \param [in] compare_function binary operation function object that will be used for comparison. +/// The signature of the function should be equivalent to the following: +/// bool f(const T &a, const T &b);. The signature does not need to have +/// const &, but function object must not modify the objects passed to it. +/// The comparator must meet the C++ named requirement Compare. +/// The default value is `BinaryFunction()`. +/// \param [in] stream [optional] HIP stream object. Default is `0` (default stream). +/// \param [in] debug_synchronous [optional] If true, synchronization after every kernel +/// launch is forced in order to check for errors. Default value is `false`. +/// +/// \returns `hipSuccess` (`0`) after successful search; otherwise a HIP runtime error of +/// type `hipError_t`. +/// +/// \par Example +/// \parblock +/// In this example a device-level find_first_of is performed where inputs and keys are +/// represented by an array of unsigned integers. +/// +/// \code{.cpp} +/// #include +/// +/// // Prepare input and output (declare pointers, allocate device memory etc.) +/// size_t size; // e.g., 8 +/// size_t keys_size; // e.g., 2 +/// unsigned int* input; // e.g., [ 6, 3, 5, 4, 1, 8, 2, 7 ] +/// unsigned int* keys; // e.g., [ 10, 5 ] +/// unsigned int* keys_output; // 1 element +/// +/// size_t temporary_storage_size_bytes; +/// void * temporary_storage_ptr = nullptr; +/// // Get required size of the temporary storage +/// rocprim::find_first_of( +/// temporary_storage_ptr, temporary_storage_size_bytes, +/// input, keys, output, size, keys_size +/// ); +/// +/// // allocate temporary storage +/// hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes); +/// +/// // perform find_first_of +/// rocprim::find_first_of( +/// temporary_storage_ptr, temporary_storage_size_bytes, +/// input, keys, output, size, keys_size +/// ); +/// // possible output: [ 2 ] +/// \endcode +/// \endparblock template( - temporary_storage, storage_size, - input, keys, output, size, keys_size, compare_function, - stream, debug_synchronous - ); + return detail::find_first_of_impl(temporary_storage, + storage_size, + input, + keys, + output, + size, + keys_size, + compare_function, + stream, + debug_synchronous); } /// @} diff --git a/rocprim/include/rocprim/device/device_find_first_of_config.hpp b/rocprim/include/rocprim/device/device_find_first_of_config.hpp new file mode 100644 index 000000000..7996329f5 --- /dev/null +++ b/rocprim/include/rocprim/device/device_find_first_of_config.hpp @@ -0,0 +1,77 @@ +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ +#define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ + +#include "config_types.hpp" + +#include "detail/device_config_helper.hpp" + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +// generic struct that instantiates custom configurations +template +struct wrapped_find_first_of_config +{ + template + struct architecture_config + { + static constexpr find_first_of_config_params params = Config{}; + }; +}; + +// specialized for rocprim::default_config, which instantiates the default_find_first_of_config +template +struct wrapped_find_first_of_config +{ + template + struct architecture_config + { + static constexpr find_first_of_config_params params = {kernel_config<256, 4>()}; + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +template +template +constexpr find_first_of_config_params + wrapped_find_first_of_config::architecture_config::params; + +template +template +constexpr find_first_of_config_params + wrapped_find_first_of_config::architecture_config::params; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +} // namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ diff --git a/rocprim/include/rocprim/intrinsics/atomic.hpp b/rocprim/include/rocprim/intrinsics/atomic.hpp index f9a8aa084..f0daea443 100644 --- a/rocprim/include/rocprim/intrinsics/atomic.hpp +++ b/rocprim/include/rocprim/intrinsics/atomic.hpp @@ -58,7 +58,25 @@ namespace detail } ROCPRIM_DEVICE ROCPRIM_INLINE - unsigned int atomic_wrapinc(unsigned int * address, unsigned int value) + unsigned int atomic_min(unsigned int* address, unsigned int value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned long atomic_min(unsigned long* address, unsigned long value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned long long atomic_min(unsigned long long* address, unsigned long long value) + { + return ::atomicMin(address, value); + } + + ROCPRIM_DEVICE ROCPRIM_INLINE + unsigned int atomic_wrapinc(unsigned int* address, unsigned int value) { return ::atomicInc(address, value); } From f6abaa78ab91178fdbae5539d545356f060b2a66 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Wed, 21 Aug 2024 18:09:17 +0500 Subject: [PATCH 5/8] find_first_of: Extend tests, add a test for large indices --- test/rocprim/identity_iterator.hpp | 4 +- test/rocprim/indirect_iterator.hpp | 2 +- test/rocprim/test_device_find_first_of.cpp | 277 ++++++++++++++------- 3 files changed, 189 insertions(+), 94 deletions(-) diff --git a/test/rocprim/identity_iterator.hpp b/test/rocprim/identity_iterator.hpp index f5f6c9a3d..5219c1e59 100644 --- a/test/rocprim/identity_iterator.hpp +++ b/test/rocprim/identity_iterator.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -24,7 +24,7 @@ namespace test_utils { -// Output iterator used in tests to check situtations when +// Output iterator used in tests to check situations when // value_type of output iterator is void template class identity_iterator diff --git a/test/rocprim/indirect_iterator.hpp b/test/rocprim/indirect_iterator.hpp index 88e29b5fb..17688dae4 100644 --- a/test/rocprim/indirect_iterator.hpp +++ b/test/rocprim/indirect_iterator.hpp @@ -63,7 +63,7 @@ class reference_wrapper T* _ptr; }; -// Iterator used in tests to check situtations when value_type of the +// Iterator used in tests to check situations when value_type of the // iterator is not the same as the return type of operator[]. // It is a simplified version of device_vector::iterator from thrust. template diff --git a/test/rocprim/test_device_find_first_of.cpp b/test/rocprim/test_device_find_first_of.cpp index 78c353e6a..ca3c7b9ce 100644 --- a/test/rocprim/test_device_find_first_of.cpp +++ b/test/rocprim/test_device_find_first_of.cpp @@ -35,6 +35,7 @@ // #include #include #include +#include #include #include @@ -47,6 +48,7 @@ // Params for tests template, class Config = rocprim::default_config, bool UseGraphs = false, @@ -55,29 +57,33 @@ struct DeviceFindFirstOfParams { using type = Type; using key_type = KeyType; + using output_type = OutputType; using compare_function = CompareFunction; using config = Config; static constexpr bool use_graphs = UseGraphs; static constexpr bool use_indirect_iterator = UseIndirectIterator; }; -// std::find_first_of is available since C++17 -template -InputIt - find_first_of_(InputIt first, InputIt last, ForwardIt s_first, ForwardIt s_last, BinaryPred p) +struct custom_compare1 { - for(; first != last; ++first) + template + ROCPRIM_HOST_DEVICE ROCPRIM_INLINE + bool operator()(const T& a, const U& b) const { - for(ForwardIt it = s_first; it != s_last; ++it) - { - if(p(*first, *it)) - { - return first; - } - } + // Since data is random, the chance of equality is negligible for floating point numbers + return static_cast(a * 1.234) == static_cast(b * 1.234); } - return last; -} +}; + +struct custom_compare2 +{ + template + ROCPRIM_HOST_DEVICE ROCPRIM_INLINE + bool operator()(test_utils::custom_test_type a, test_utils::custom_test_type b) + { + return a.x == b.x; + } +}; template class RocprimDeviceFindFirstOfTests : public ::testing::Test @@ -85,16 +91,44 @@ class RocprimDeviceFindFirstOfTests : public ::testing::Test public: using type = typename Params::type; using key_type = typename Params::key_type; + using output_type = typename Params::output_type; using compare_function = typename Params::compare_function; using config = typename Params::config; - const bool debug_synchronous = false; + static constexpr bool debug_synchronous = false; static constexpr bool use_graphs = Params::use_graphs; static constexpr bool use_indirect_iterator = Params::use_indirect_iterator; }; -using RocprimDeviceFindFirstOfTestsParams = ::testing::Types, - DeviceFindFirstOfParams, - DeviceFindFirstOfParams>; +using RocprimDeviceFindFirstOfTestsParams + = ::testing::Types, + DeviceFindFirstOfParams, + rocprim::default_config, + true, + true>, + DeviceFindFirstOfParams, + rocprim::default_config, + true, + false>, + DeviceFindFirstOfParams, + DeviceFindFirstOfParams, + test_utils::custom_test_type, + size_t, + custom_compare2, + rocprim::default_config, + false, + true>>; TYPED_TEST_SUITE(RocprimDeviceFindFirstOfTests, RocprimDeviceFindFirstOfTestsParams); @@ -104,12 +138,13 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); HIP_CHECK(hipSetDevice(device_id)); - using type = typename TestFixture::type; - using key_type = typename TestFixture::key_type; - using output_type = size_t; - using compare_function = typename TestFixture::compare_function; - using config = typename TestFixture::config; - const bool debug_synchronous = TestFixture::debug_synchronous; + using type = typename TestFixture::type; + using key_type = typename TestFixture::key_type; + using output_type = typename TestFixture::output_type; + using compare_function = typename TestFixture::compare_function; + using config = typename TestFixture::config; + + constexpr bool debug_synchronous = TestFixture::debug_synchronous; constexpr bool use_indirect_iterator = TestFixture::use_indirect_iterator; for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) @@ -118,18 +153,17 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; SCOPED_TRACE(testing::Message() << "with seed = " << seed_value); - std::cout << "with seed = " << seed_value << std::endl; - for(size_t size : test_utils::get_sizes(seed_value)) { + SCOPED_TRACE(testing::Message() << "with size = " << size); const size_t keys_size = std::sqrt(test_utils::get_random_value(0, size, seed_value)); - for(double starting_point : {0.0, 0.2, 0.8, 1.0}) + // Starting point is an appoximate position of the first match we want to test for + for(double starting_point : {0.0, 0.234, 0.876, 1.0, 100.0}) { - std::cout << "with size = " << size << ", keys_size = " << keys_size - << ", starting_point = " << starting_point << std::endl; + SCOPED_TRACE(testing::Message() << "with starting_point = " << starting_point); hipStream_t stream = 0; // default if(TestFixture::use_graphs) @@ -138,78 +172,61 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); } - SCOPED_TRACE(testing::Message() << "with size = " << size); - // Generate data - std::vector input; - std::vector key_input; - if ROCPRIM_IF_CONSTEXPR(rocprim::is_floating_point::value) + auto keys = test_utils::get_random_data(keys_size, 0, 10, seed_value + 1); + + std::vector input(size); + // Generate the input data in such a way that it does not contain any values from + // keys before the starting point + const size_t size1 + = starting_point >= 1.0 ? size : static_cast(size * starting_point); + const size_t size2 = size - size1; + if(size1 > 0) { - // input = test_utils::get_random_data(size, -1000, 1000, seed_value); - // key_input - // = test_utils::get_random_data(keys_size, -1000, 1000, seed_value); + auto input1 = test_utils::get_random_data(size1, 20, 100, seed_value + 2); + std::copy(input1.begin(), input1.end(), input.begin()); } - else + if(size2 > 0) { - key_input = test_utils::get_random_data( - keys_size, - 0, - 10, - // test_utils::numeric_limits::min(), - // test_utils::numeric_limits::max(), - seed_value); - input = test_utils::get_random_data( - size, - 0, - 1000, - // test_utils::numeric_limits::min(), - // test_utils::numeric_limits::max(), - seed_value); - - if(size > 0 && keys_size > 0) + auto input2 = test_utils::get_random_data(size2, 0, 100, seed_value + 3); + std::copy(input2.begin(), input2.end(), input.begin() + size1); + } + + // Explicitly test for boundary cases + if(size > 0 && keys_size > 0) + { + if(starting_point == 0.0) { - // Change the input range before starting_point to ensure that it does not contain - // any values from keys - auto minmax_key_input - = std::minmax_element(key_input.begin(), key_input.end()); - const auto min_key_input = *minmax_key_input.first; - const auto max_key_input = *minmax_key_input.second; - const auto max_input - = *std::minmax_element(input.begin(), input.end()).second; - // std::cout << "min_key_input = " << min_key_input << ", max_key_input = " << max_key_input << ", max_input = " << max_input << std::endl; - for(size_t i = 0; i < size * starting_point; ++i) - { - if(min_key_input <= input[i] && input[i] <= max_key_input) - { - input[i] = max_input; - } - } + input[0] = keys[keys_size - 1]; + } + else if(starting_point == 1.0) + { + input[size - 1] = keys[0]; } } type* d_input; - key_type* d_key_input; + key_type* d_keys; output_type* d_output; HIP_CHECK( test_common_utils::hipMallocHelper(&d_input, input.size() * sizeof(*d_input))); HIP_CHECK( - test_common_utils::hipMallocHelper(&d_key_input, - key_input.size() * sizeof(*d_key_input))); + test_common_utils::hipMallocHelper(&d_keys, keys.size() * sizeof(*d_keys))); HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, sizeof(*d_output))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(*d_input), hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_key_input, - key_input.data(), - key_input.size() * sizeof(*d_key_input), + HIP_CHECK(hipMemcpy(d_keys, + keys.data(), + keys.size() * sizeof(*d_keys), hipMemcpyHostToDevice)); const auto input_it = test_utils::wrap_in_indirect_iterator(d_input); - const auto key_input_it - = test_utils::wrap_in_indirect_iterator(d_key_input); + const auto keys_it + = test_utils::wrap_in_indirect_iterator(d_keys); // compare function compare_function compare_op; @@ -221,10 +238,10 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) HIP_CHECK(rocprim::find_first_of(d_temp_storage, temp_storage_size_bytes, input_it, - key_input_it, + keys_it, d_output, input.size(), - key_input.size(), + keys.size(), compare_op, stream, debug_synchronous)); @@ -246,10 +263,10 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) HIP_CHECK(rocprim::find_first_of(d_temp_storage, temp_storage_size_bytes, input_it, - key_input_it, + keys_it, d_output, input.size(), - key_input.size(), + keys.size(), compare_op, stream, debug_synchronous)); @@ -269,20 +286,17 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) HIP_CHECK(hipMemcpy(&output, d_output, sizeof(*d_output), hipMemcpyDeviceToHost)); // Check - auto expected = find_first_of_(input.begin(), - input.end(), - key_input.begin(), - key_input.end(), - compare_op) + auto expected = std::find_first_of(input.begin(), + input.end(), + keys.begin(), + keys.end(), + compare_op) - input.begin(); - std::cout << "expected = " << expected << ", output = " << output << ", " - << double(expected) / double(size) << std::endl; - ASSERT_EQ(output, expected); HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); @@ -295,3 +309,84 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) } } } + +TEST(RocprimDeviceFindFirstOfTests, LargeIndices) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id = " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using output_type = size_t; + using config = rocprim::default_config; + + constexpr bool debug_synchronous = false; + + for(size_t size : test_utils::get_large_sizes(seeds[0])) + { + SCOPED_TRACE(testing::Message() << "with size = " << size); + + const size_t keys_size = 12; + + for(double starting_point : {0.0, 0.12, 0.78, 1.1}) + { + SCOPED_TRACE(testing::Message() << "with starting_point = " << starting_point); + + hipStream_t stream = 0; // default + + output_type* d_output; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_output, sizeof(*d_output))); + + const output_type expected + = std::min(size, static_cast(starting_point * size)); + + auto input_it = rocprim::make_counting_iterator(size_t(0)); + auto keys_it = rocprim::make_counting_iterator(expected); + + rocprim::equal_to compare_op; + + // temp storage + size_t temp_storage_size_bytes; + void* d_temp_storage = nullptr; + // Get size of d_temp_storage + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + size, + keys_size, + compare_op, + stream, + debug_synchronous)); + + // temp_storage_size_bytes must be >0 + ASSERT_GT(temp_storage_size_bytes, 0); + + // allocate temporary storage + HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); + + // Run + HIP_CHECK(rocprim::find_first_of(d_temp_storage, + temp_storage_size_bytes, + input_it, + keys_it, + d_output, + size, + keys_size, + compare_op, + stream, + debug_synchronous)); + + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + // Copy output to host and check + output_type output; + HIP_CHECK(hipMemcpy(&output, d_output, sizeof(*d_output), hipMemcpyDeviceToHost)); + ASSERT_EQ(output, expected); + + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temp_storage)); + } + } +} From 6f4560b606a25968b640e76d112330f6e1cfa1c8 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Thu, 5 Sep 2024 14:40:13 +0500 Subject: [PATCH 6/8] find_first_of: Use ordered_block_id for better load balancing between CUs --- .../rocprim/device/device_find_first_of.hpp | 136 +++++++++++------- 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/rocprim/include/rocprim/device/device_find_first_of.hpp b/rocprim/include/rocprim/device/device_find_first_of.hpp index 320382f3b..80080b4f7 100644 --- a/rocprim/include/rocprim/device/device_find_first_of.hpp +++ b/rocprim/include/rocprim/device/device_find_first_of.hpp @@ -24,6 +24,7 @@ #include "../config.hpp" #include "../detail/temp_storage.hpp" #include "config_types.hpp" +#include "detail/ordered_block_id.hpp" #include "device_find_first_of_config.hpp" #include "device_transform.hpp" @@ -57,25 +58,23 @@ namespace detail } \ while(0) -#define RETURN_ON_ERROR(...) \ - do \ - { \ - hipError_t error = (__VA_ARGS__); \ - if(error != hipSuccess) \ - { \ - return error; \ - } \ - } \ - while(0) +ROCPRIM_KERNEL +static void + init_find_first_of_kernel(size_t* output, size_t size, ordered_block_id ordered_bid) +{ + *output = size; + ordered_bid.reset(); +} template ROCPRIM_KERNEL __launch_bounds__(device_params().kernel_config.block_size) -void find_first_of_kernel(InputIterator1 input, - InputIterator2 keys, - size_t* output, - size_t size, - size_t keys_size, - BinaryFunction compare_function) +void find_first_of_kernel(InputIterator1 input, + InputIterator2 keys, + size_t* output, + size_t size, + size_t keys_size, + ordered_block_id ordered_bid, + BinaryFunction compare_function) { constexpr find_first_of_config_params params = device_params(); @@ -87,14 +86,14 @@ void find_first_of_kernel(InputIterator1 input, using type = typename std::iterator_traits::value_type; using key_type = typename std::iterator_traits::value_type; - const unsigned int thread_id = ::rocprim::detail::block_thread_id<0>(); - const unsigned int block_id = ::rocprim::detail::block_id<0>(); - const unsigned int number_of_blocks = ::rocprim::detail::grid_size<0>(); + const unsigned int thread_id = ::rocprim::detail::block_thread_id<0>(); ROCPRIM_SHARED_MEMORY struct { unsigned int block_first_index; - size_t grid_first_index; + size_t global_first_index; + + typename decltype(ordered_bid)::storage_type ordered_bid; } storage; if(thread_id == 0) @@ -103,26 +102,27 @@ void find_first_of_kernel(InputIterator1 input, } syncthreads(); - size_t block_offset = block_id * items_per_block; - for(; block_offset < size; block_offset += number_of_blocks * items_per_block) + while(true) { if(thread_id == 0) { - storage.grid_first_index = atomic_load(output); + storage.global_first_index = atomic_load(output); } - syncthreads(); - if(storage.grid_first_index < block_offset) + const size_t block_id = ordered_bid.get(thread_id, storage.ordered_bid); + const size_t block_offset = block_id * items_per_block; + // ordered_bid.get() calls syncthreads(), it is safe to read global_first_index + + // Exit if all input has been processed or one of previous blocks has found a match + if(block_offset >= storage.global_first_index) { - // No need to continue if one of previous blocks (or this one) has found a match break; } - type items[items_per_thread]; - unsigned int thread_first_index = identity; if(block_offset + items_per_block <= size) { + type items[items_per_thread]; block_load_direct_striped(thread_id, input + block_offset, items); for(size_t key_index = 0; key_index < keys_size; ++key_index) { @@ -140,6 +140,8 @@ void find_first_of_kernel(InputIterator1 input, else { const unsigned int valid = size - block_offset; + + type items[items_per_thread]; block_load_direct_striped(thread_id, input + block_offset, items, valid); for(size_t key_index = 0; key_index < keys_size; ++key_index) { @@ -162,9 +164,13 @@ void find_first_of_kernel(InputIterator1 input, atomic_min(&storage.block_first_index, thread_first_index * block_size + thread_id); } syncthreads(); - if(thread_id == 0 && storage.block_first_index != identity) + if(storage.block_first_index != identity) { - atomic_min(output, block_offset + storage.block_first_index); + if(thread_id == 0) + { + atomic_min(output, block_offset + storage.block_first_index); + } + break; } } } @@ -201,51 +207,73 @@ hipError_t find_first_of_impl(void* temporary_storage, const unsigned int items_per_thread = params.kernel_config.items_per_thread; const unsigned int items_per_block = block_size * items_per_thread; - if(temporary_storage == nullptr) + using ordered_bid_type = ordered_block_id; + + // As output can be an arbitrary iterator, we need to use an intermediate buffer to do atomic + // operations with it + size_t* tmp_output = nullptr; + ordered_bid_type::id_type* ordered_bid_storage = nullptr; + + // Calculate required temporary storage + result = temp_storage::partition( + temporary_storage, + storage_size, + temp_storage::make_linear_partition( + temp_storage::ptr_aligned_array(&tmp_output, 1), + temp_storage::make_partition(&ordered_bid_storage, + ordered_bid_type::get_temp_storage_layout()))); + if(result != hipSuccess || temporary_storage == nullptr) { - storage_size = sizeof(size_t); - return hipSuccess; + return result; } - size_t* tmp_output = reinterpret_cast(temporary_storage); + auto ordered_bid = ordered_bid_type::create(ordered_bid_storage); - RETURN_ON_ERROR( - hipMemcpyAsync(tmp_output, &size, sizeof(*tmp_output), hipMemcpyHostToDevice, stream)); + std::chrono::steady_clock::time_point start; - if(size > 0) + if(debug_synchronous) { - std::chrono::steady_clock::time_point start; - if(debug_synchronous) - { - start = std::chrono::steady_clock::now(); - } + start = std::chrono::steady_clock::now(); + } + init_find_first_of_kernel<<<1, 1, 0, stream>>>(tmp_output, size, ordered_bid); + ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("init_find_first_of_kernel", 1, start); - const size_t shared_memory_size = 0; + if(size > 0 && keys_size > 0) + { auto kernel = find_first_of_kernel; + const size_t shared_memory_size = 0; + // Choose minimum grid size needed to achieve the highest occupancy int min_grid_size, max_block_size; - RETURN_ON_ERROR(hipOccupancyMaxPotentialBlockSize(&min_grid_size, - &max_block_size, - kernel, - shared_memory_size, - int(block_size))); + result = hipOccupancyMaxPotentialBlockSize(&min_grid_size, + &max_block_size, + kernel, + shared_memory_size, + int(block_size)); + if(result != hipSuccess) + { + return result; + } + const size_t num_blocks = std::min(size_t(min_grid_size), ceiling_div(size, items_per_block)); + if(debug_synchronous) + { + start = std::chrono::steady_clock::now(); + } kernel<<>>(input, keys, tmp_output, size, keys_size, + ordered_bid, compare_function); ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR("find_first_of_kernel", size, start); } - RETURN_ON_ERROR( - transform(tmp_output, output, 1, ::rocprim::identity(), stream, debug_synchronous)); - - return hipSuccess; + return transform(tmp_output, output, 1, ::rocprim::identity(), stream, debug_synchronous); } } // namespace detail @@ -289,8 +317,6 @@ hipError_t find_first_of_impl(void* temporary_storage, /// The signature of the function should be equivalent to the following: /// bool f(const T &a, const T &b);. The signature does not need to have /// const &, but function object must not modify the objects passed to it. -/// The comparator must meet the C++ named requirement Compare. -/// The default value is `BinaryFunction()`. /// \param [in] stream [optional] HIP stream object. Default is `0` (default stream). /// \param [in] debug_synchronous [optional] If true, synchronization after every kernel /// launch is forced in order to check for errors. Default value is `false`. @@ -329,7 +355,7 @@ hipError_t find_first_of_impl(void* temporary_storage, /// temporary_storage_ptr, temporary_storage_size_bytes, /// input, keys, output, size, keys_size /// ); -/// // possible output: [ 2 ] +/// // output: [ 2 ] /// \endcode /// \endparblock template Date: Thu, 5 Sep 2024 14:55:48 +0500 Subject: [PATCH 7/8] find_first_of: Implement autotuning --- benchmark/ConfigAutotuneSettings.cmake | 4 + benchmark/benchmark_device_find_first_of.cpp | 37 ++- benchmark/benchmark_device_find_first_of.hpp | 176 ----------- ...hmark_device_find_first_of.parallel.cpp.in | 30 ++ ...enchmark_device_find_first_of.parallel.hpp | 282 ++++++++++++++++++ .../detail/config/device_find_first_of.hpp | 263 ++++++++++++++++ .../device/detail/device_config_helper.hpp | 18 +- .../device/device_find_first_of_config.hpp | 5 +- scripts/autotune/create_optimization.py | 10 + .../templates/find_first_of_config_template | 20 ++ 10 files changed, 659 insertions(+), 186 deletions(-) delete mode 100644 benchmark/benchmark_device_find_first_of.hpp create mode 100644 benchmark/benchmark_device_find_first_of.parallel.cpp.in create mode 100644 benchmark/benchmark_device_find_first_of.parallel.hpp create mode 100644 rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp create mode 100644 scripts/autotune/templates/find_first_of_config_template diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake index c014a5b57..8c18e334d 100644 --- a/benchmark/ConfigAutotuneSettings.cmake +++ b/benchmark/ConfigAutotuneSettings.cmake @@ -111,5 +111,9 @@ DataType;BlockSize;" PARENT_SCOPE) set(list_across_names "KeyType;ValueType;BlockSize;TilesPerBlock" PARENT_SCOPE) set(list_across "${LIMITED_TUNING_TYPES};${TUNING_TYPES};128 192 256 384 512;1 2" PARENT_SCOPE) set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@TilesPerBlock@" PARENT_SCOPE) + elseif(file STREQUAL "benchmark_device_find_first_of") + set(list_across_names "DataType;BlockSize" PARENT_SCOPE) + set(list_across "${LIMITED_TUNING_TYPES};32 64 128 256 512 1024" PARENT_SCOPE) + set(output_pattern_suffix "@DataType@_@BlockSize@" PARENT_SCOPE) endif() endfunction() diff --git a/benchmark/benchmark_device_find_first_of.cpp b/benchmark/benchmark_device_find_first_of.cpp index 67606e9fa..e48a603df 100644 --- a/benchmark/benchmark_device_find_first_of.cpp +++ b/benchmark/benchmark_device_find_first_of.cpp @@ -20,7 +20,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "benchmark_device_find_first_of.hpp" +#include "benchmark_device_find_first_of.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser @@ -35,8 +35,8 @@ #include #include -#ifndef DEFAULT_N -const size_t DEFAULT_N = 1024 * 1024 * 32; +#ifndef DEFAULT_BYTES +constexpr size_t DEFAULT_BYTES = size_t{1} << 27; // 128 MiB #endif #define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE) \ @@ -55,9 +55,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #define CREATE_BENCHMARK(TYPE) \ { \ + CREATE_BENCHMARK0(TYPE, 1) \ CREATE_BENCHMARK0(TYPE, 10) \ - CREATE_BENCHMARK0(TYPE, 128) \ - CREATE_BENCHMARK0(TYPE, 1024) \ + CREATE_BENCHMARK0(TYPE, 100) \ + CREATE_BENCHMARK0(TYPE, 1000) \ CREATE_BENCHMARK0(TYPE, 10000) \ } // clang-format on @@ -65,13 +66,24 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); +#ifdef BENCHMARK_CONFIG_TUNING + // optionally run an evenly split subset of benchmarks, when making multiple program invocations + parser.set_optional("parallel_instance", + "parallel_instance", + 0, + "parallel instance index"); + parser.set_optional("parallel_instances", + "parallel_instances", + 1, + "total parallel instances"); +#endif parser.run_and_exit_if_error(); // Parse argv @@ -92,16 +104,29 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks{}; +#ifdef BENCHMARK_CONFIG_TUNING + const int parallel_instance = parser.get("parallel_instance"); + const int parallel_instances = parser.get("parallel_instances"); + config_autotune_register::register_benchmark_subset(benchmarks, + parallel_instance, + parallel_instances, + size, + seed, + stream); +#else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(int16_t) CREATE_BENCHMARK(int32_t) + CREATE_BENCHMARK(float) CREATE_BENCHMARK(int64_t) + CREATE_BENCHMARK(double) using custom_int2 = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_longlong_double) +#endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_find_first_of.hpp b/benchmark/benchmark_device_find_first_of.hpp deleted file mode 100644 index 970f911ec..000000000 --- a/benchmark/benchmark_device_find_first_of.hpp +++ /dev/null @@ -1,176 +0,0 @@ -// MIT License -// -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ -#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ - -#include "benchmark_utils.hpp" - -// Google Benchmark -#include - -// HIP API -#include - -// rocPRIM -#include - -#include -#include -#include - -template -struct device_find_first_of_benchmark : public config_autotune_interface -{ - size_t keys_size; - double first_occurence; - - device_find_first_of_benchmark(size_t keys_size, double first_occurence) - : keys_size(keys_size), first_occurence(first_occurence) - {} - - std::string name() const override - { - using namespace std::string_literals; - return bench_naming::format_name( - "{lvl:device,algo:find_first_of,keys_size:" + std::to_string(keys_size) - + ",first_occurence:" + std::to_string(first_occurence) - + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); - } - - static constexpr unsigned int batch_size = 10; - static constexpr unsigned int warmup_size = 5; - - void run(benchmark::State& state, - size_t size, - const managed_seed& seed, - hipStream_t stream) const override - { - using type = Key; - using key_type = Key; - using output_type = size_t; - - // Generate data - std::vector key_input - = get_random_data(keys_size, 0, 100, seed.get_0()); - std::vector input - = get_random_data(size, 101, generate_limits::max(), seed.get_0()); - - // Set the first occurence of keys in input - const size_t p = static_cast(size * first_occurence); - if(p < size) - { - input[p] = key_input[keys_size / 2]; - } - - type* d_input; - key_type* d_key_input; - output_type* d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); - HIP_CHECK(hipMalloc(&d_key_input, size * sizeof(*d_key_input))); - HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); - - HIP_CHECK(hipMemcpy(d_input, - input.data(), - input.size() * sizeof(*d_input), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_key_input, - key_input.data(), - key_input.size() * sizeof(*d_key_input), - hipMemcpyHostToDevice)); - - ::rocprim::equal_to compare_op; - - void* d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - auto run = [&]() - { - HIP_CHECK(rocprim::find_first_of(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_key_input, - d_output, - input.size(), - key_input.size(), - compare_op, - stream)); - }; - - run(); - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - run(); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for(size_t i = 0; i < batch_size; i++) - { - run(); - } - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - - // Only a part of data (before the first occurence) must be actually processed - const size_t effective_size = static_cast(size * first_occurence); - state.SetBytesProcessed(state.iterations() * batch_size * effective_size - * sizeof(*d_input)); - state.SetItemsProcessed(state.iterations() * batch_size * effective_size); - // All threads of all blocks read the same keys so this value is limited by cache bandwidth - state.counters["bytes_per_second_keys"] = benchmark::Counter( - static_cast(state.iterations() * batch_size * effective_size * keys_size - * sizeof(*d_key_input)), - benchmark::Counter::kIsRate, - benchmark::Counter::kIs1024); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_key_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temporary_storage)); - } -}; - -#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/benchmark/benchmark_device_find_first_of.parallel.cpp.in b/benchmark/benchmark_device_find_first_of.parallel.cpp.in new file mode 100644 index 000000000..230f6640c --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.cpp.in @@ -0,0 +1,30 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark_device_find_first_of.parallel.hpp" +#include "benchmark_utils.hpp" + +namespace +{ +auto benchmarks = config_autotune_register::create_bulk( + device_find_first_of_benchmark_generator<@DataType@, @BlockSize@>::create); +} // namespace diff --git a/benchmark/benchmark_device_find_first_of.parallel.hpp b/benchmark/benchmark_device_find_first_of.parallel.hpp new file mode 100644 index 000000000..5681fe69e --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.hpp @@ -0,0 +1,282 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ +#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ + +#include "benchmark_utils.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +// rocPRIM +#include + +#include +#include +#include + +template +std::string config_name() +{ + const rocprim::detail::find_first_of_config_params config = Config(); + return "{bs:" + std::to_string(config.kernel_config.block_size) + + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; +} + +template<> +inline std::string config_name() +{ + return "default_config"; +} + +template +struct device_find_first_of_benchmark : public config_autotune_interface +{ + std::vector keys_sizes; + std::vector first_occurrences; + + device_find_first_of_benchmark(size_t keys_size, double first_occurrence) + { + keys_sizes.push_back(keys_size); + first_occurrences.push_back(first_occurrence); + } + + device_find_first_of_benchmark(const std::vector& keys_sizes, + const std::vector& first_occurrences) + { + this->keys_sizes = keys_sizes; + this->first_occurrences = first_occurrences; + } + + std::string name() const override + { + using namespace std::string_literals; + return bench_naming::format_name( + "{lvl:device,algo:find_first_of,"s + + (keys_sizes.size() == 1 ? "keys_size:"s + std::to_string(keys_sizes[0]) : ""s) + + (first_occurrences.size() == 1 + ? ",first_occurrence:"s + std::to_string(first_occurrences[0]) + : ""s) + + ",value_type:"s + std::string(Traits::name()) + ",cfg:" + config_name() + + "}"); + } + + static constexpr unsigned int batch_size = 10; + static constexpr unsigned int warmup_size = 2; + + void run(benchmark::State& state, + size_t bytes, + const managed_seed& seed, + hipStream_t stream) const override + { + using type = T; + using key_type = T; + using output_type = size_t; + + const size_t size = bytes / sizeof(type); + + const size_t max_keys_size = *std::max_element(keys_sizes.begin(), keys_sizes.end()); + + // Generate data + std::vector key_input + = get_random_data(max_keys_size, 0, 100, seed.get_0()); + std::vector input + = get_random_data(size, 101, generate_limits::max(), seed.get_0()); + + std::vector d_inputs(first_occurrences.size()); + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + type* d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); + HIP_CHECK(hipMemcpyAsync(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + // Set the first occurrence of keys in input + const size_t p = static_cast(size * first_occurrences[fi]); + if(p < size) + { + const type key = key_input[0]; + HIP_CHECK(hipMemcpyAsync(d_input + p, + &key, + sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + } + d_inputs[fi] = d_input; + } + + key_type* d_key_input; + output_type* d_output; + HIP_CHECK(hipMalloc(&d_key_input, max_keys_size * sizeof(*d_key_input))); + HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_key_input, + key_input.data(), + key_input.size() * sizeof(*d_key_input), + hipMemcpyHostToDevice)); + + ::rocprim::equal_to compare_op; + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + auto run = [&](size_t key_size, const type* d_input) + { + HIP_CHECK(rocprim::find_first_of(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_key_input, + d_output, + input.size(), + key_size, + compare_op, + stream)); + }; + + size_t max_temporary_storage_bytes = 0; + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[0]); + max_temporary_storage_bytes + = std::max(max_temporary_storage_bytes, temporary_storage_bytes); + } + temporary_storage_bytes = max_temporary_storage_bytes; + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + + // Only a part of data (before the first occurrence) must be actually processed. In ideal + // cases when no thread blocks do unneeded work (i.e. exit early once the match is found), + // performance for different values of first_occurrence must be similar. + size_t sum_effective_size = 0; + for(double first_occurrence : first_occurrences) + { + sum_effective_size += static_cast(size * first_occurrence); + } + size_t sum_keys_size = 0; + for(size_t keys_size : keys_sizes) + { + sum_keys_size += keys_size; + } + state.SetBytesProcessed(state.iterations() * batch_size * sum_effective_size + * sizeof(*d_inputs[0])); + state.SetItemsProcessed(state.iterations() * batch_size * sum_effective_size); + // Each input is read once but all keys are read by all threads so performance is likely + // compute-bound or bound by cache bandwidth for reading keys rather than reading inputs. + // Let's additionally report the rate of comparisons to see if it reaches a plateau with + // increasing keys_size. + state.counters["comparisons_per_second"] + = benchmark::Counter(static_cast(state.iterations() * batch_size + * sum_effective_size * sum_keys_size), + benchmark::Counter::kIsRate); + + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + HIP_CHECK(hipFree(d_inputs[fi])); + } + HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temporary_storage)); + } +}; + +template +struct device_find_first_of_benchmark_generator +{ + + template + struct create_ipt + { + using generated_config = rocprim::find_first_of_config; + + void operator()(std::vector>& storage) + { + std::vector keys_sizes{1, 10, 100, 1000}; + std::vector first_occurrences{0.1, 0.5, 1.0}; + storage.emplace_back( + std::make_unique>( + keys_sizes, + first_occurrences)); + } + }; + + static void create(std::vector>& storage) + { + static constexpr unsigned int min_items_per_thread = 1; + static constexpr unsigned int max_items_per_thread = 16; + static_for_each, + create_ipt>(storage); + } +}; + +#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ diff --git a/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp new file mode 100644 index 000000000..ce06277a8 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp @@ -0,0 +1,263 @@ +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ + +#include "../../../type_traits.hpp" +#include "../device_config_helper.hpp" + +#include + +/* DO NOT EDIT THIS FILE + * This file is automatically generated by `/scripts/autotune/create_optimization.py`. + * so most likely you want to edit rocprim/device/device_(algo)_config.hpp + */ + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 15> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<128, 13> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 15> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<1024, 14> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +} // end namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp index c539704c5..095f4f50f 100644 --- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp +++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp @@ -1087,7 +1087,7 @@ namespace detail struct find_first_of_config_params { - kernel_config_params kernel_config; + kernel_config_params kernel_config{}; }; } // namespace detail @@ -1102,12 +1102,26 @@ struct find_first_of_config : public detail::find_first_of_config_params #ifndef DOXYGEN_SHOULD_SKIP_THIS constexpr find_first_of_config() : detail::find_first_of_config_params{ - {BlockSize, ItemsPerThread, ROCPRIM_GRID_SIZE_LIMIT} + {BlockSize, ItemsPerThread, 0} } {} #endif }; +namespace detail +{ + +template +struct default_find_first_of_config_base +{ + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = find_first_of_config<256, ::rocprim::max(1u, 16u / item_scale)>; +}; + +} // namespace detail + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/device/device_find_first_of_config.hpp b/rocprim/include/rocprim/device/device_find_first_of_config.hpp index 7996329f5..26c0a30d7 100644 --- a/rocprim/include/rocprim/device/device_find_first_of_config.hpp +++ b/rocprim/include/rocprim/device/device_find_first_of_config.hpp @@ -22,7 +22,7 @@ #define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ #include "config_types.hpp" - +#include "detail/config/device_find_first_of.hpp" #include "detail/device_config_helper.hpp" /// \addtogroup primitivesmodule_deviceconfigs @@ -51,7 +51,8 @@ struct wrapped_find_first_of_config template struct architecture_config { - static constexpr find_first_of_config_params params = {kernel_config<256, 4>()}; + static constexpr find_first_of_config_params params + = default_find_first_of_config(Arch), Type>(); }; }; diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py index 130bdb3cf..2bebf1fd5 100755 --- a/scripts/autotune/create_optimization.py +++ b/scripts/autotune/create_optimization.py @@ -622,6 +622,14 @@ class AlgorithmDeviceReduceByKey(Algorithm): def __init__(self, fallback_entries): Algorithm.__init__(self, fallback_entries) +class AlgorithmDeviceFindFirstOf(Algorithm): + algorithm_name = "device_find_first_of" + cpp_configuration_template_name = "find_first_of_config_template" + config_selection_params = [ + SelectionType(name="value_type", is_optional=False, select_on_size_only=True)] + def __init__(self, fallback_entries): + Algorithm.__init__(self, fallback_entries) + def filt_algo_regex(e: FallbackCase, algorithm_name): if e.algo_regex: return re.match(e.algo_regex, algorithm_name) is not None @@ -679,6 +687,8 @@ def create_algorithm(algorithm_name: str, fallback_entries: List[FallbackCase]): return AlgorithmDeviceSelectUniqueByKey(fallback_entries) elif algorithm_name == 'device_reduce_by_key': return AlgorithmDeviceReduceByKey(fallback_entries) + elif algorithm_name == 'device_find_first_of': + return AlgorithmDeviceFindFirstOf(fallback_entries) else: raise(NotSupportedError(f'Algorithm "{algorithm_name}" is not supported (yet)')) diff --git a/scripts/autotune/templates/find_first_of_config_template b/scripts/autotune/templates/find_first_of_config_template new file mode 100644 index 000000000..84e7fb476 --- /dev/null +++ b/scripts/autotune/templates/find_first_of_config_template @@ -0,0 +1,20 @@ +{% extends "config_template" %} + +{% macro get_header_guard() %} +ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +{%- endmacro %} + +{% macro kernel_configuration(measurement) -%} +find_first_of_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { }; +{%- endmacro %} + +{% macro general_case() -%} +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; +{%- endmacro %} + +{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%} +// Based on {{ based_on_type }} +template struct default_find_first_of_config({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> : +{%- endmacro %} From 1cce46ba08ad7567c8ea8d85aa13b092502f4bd9 Mon Sep 17 00:00:00 2001 From: Anton Gorenko Date: Thu, 5 Sep 2024 15:21:43 +0500 Subject: [PATCH 8/8] find_first_of: Add docs --- CHANGELOG.md | 2 ++ docs/device_ops/find_first_of.rst | 19 +++++++++++++++++++ docs/device_ops/index.rst | 1 + docs/reference/ops_summary.rst | 5 +++++ docs/sphinx/_toc.yml.in | 1 + 5 files changed, 28 insertions(+) create mode 100644 docs/device_ops/find_first_of.rst diff --git a/CHANGELOG.md b/CHANGELOG.md index ff6db71e0..ed41489b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ Documentation for rocPRIM is available at ### Additions +* Added the parallel `find_first_of` device function with autotuned configurations, this function is similar to `std::find_first_of`, it searches for the first occurrence of any of the provided elements. + ### Changes ### Fixes diff --git a/docs/device_ops/find_first_of.rst b/docs/device_ops/find_first_of.rst new file mode 100644 index 000000000..1781ae067 --- /dev/null +++ b/docs/device_ops/find_first_of.rst @@ -0,0 +1,19 @@ +.. meta:: + :description: rocPRIM documentation and API reference library + :keywords: rocPRIM, ROCm, API, documentation + +.. _dev-find_first_of: + + +Find first of +------------- + +Configuring the kernel +~~~~~~~~~~~~~~~~~~~~~~ + +.. doxygenstruct:: rocprim::find_first_of_config + +find_first_of +~~~~~~~~~~~~~ + +.. doxygenfunction:: rocprim::find_first_of(void* temporary_storage, size_t& storage_size, InputIterator1 input, InputIterator2 keys, OutputIterator output, size_t size, size_t keys_size, BinaryFunction compare_function = BinaryFunction(), hipStream_t stream = 0, bool debug_synchronous = false) diff --git a/docs/device_ops/index.rst b/docs/device_ops/index.rst index 3c27a1c15..74db4ee48 100644 --- a/docs/device_ops/index.rst +++ b/docs/device_ops/index.rst @@ -25,3 +25,4 @@ * :ref:`dev-memcpy` * :ref:`dev-nth_element` * :ref:`dev-partial_sort` + * :ref:`dev-find_first_of` diff --git a/docs/reference/ops_summary.rst b/docs/reference/ops_summary.rst index 9dbf13d68..9121e2e31 100644 --- a/docs/reference/ops_summary.rst +++ b/docs/reference/ops_summary.rst @@ -50,6 +50,11 @@ Data Movement * ``load`` the complementary operations of the above ones. * ``memcpy`` copies bytes between device sources and destinations +Sequence Search +=============== + +* ``find_first_of`` searches for the first occurrence of any of the provided elements. + Other operations ====================== diff --git a/docs/sphinx/_toc.yml.in b/docs/sphinx/_toc.yml.in index 29e2bf154..b270acd5c 100644 --- a/docs/sphinx/_toc.yml.in +++ b/docs/sphinx/_toc.yml.in @@ -36,6 +36,7 @@ subtrees: - file: device_ops/histogram.rst - file: device_ops/device_copy.rst - file: device_ops/memcpy.rst + - file: device_ops/find_first_of.rst - file: block_ops/index.rst subtrees: - entries: