diff --git a/benchmark/ConfigAutotuneSettings.cmake b/benchmark/ConfigAutotuneSettings.cmake index c014a5b57..8c18e334d 100644 --- a/benchmark/ConfigAutotuneSettings.cmake +++ b/benchmark/ConfigAutotuneSettings.cmake @@ -111,5 +111,9 @@ DataType;BlockSize;" PARENT_SCOPE) set(list_across_names "KeyType;ValueType;BlockSize;TilesPerBlock" PARENT_SCOPE) set(list_across "${LIMITED_TUNING_TYPES};${TUNING_TYPES};128 192 256 384 512;1 2" PARENT_SCOPE) set(output_pattern_suffix "@KeyType@_@ValueType@_@BlockSize@_@TilesPerBlock@" PARENT_SCOPE) + elseif(file STREQUAL "benchmark_device_find_first_of") + set(list_across_names "DataType;BlockSize" PARENT_SCOPE) + set(list_across "${LIMITED_TUNING_TYPES};32 64 128 256 512 1024" PARENT_SCOPE) + set(output_pattern_suffix "@DataType@_@BlockSize@" PARENT_SCOPE) endif() endfunction() diff --git a/benchmark/benchmark_device_find_first_of.cpp b/benchmark/benchmark_device_find_first_of.cpp index 67606e9fa..e48a603df 100644 --- a/benchmark/benchmark_device_find_first_of.cpp +++ b/benchmark/benchmark_device_find_first_of.cpp @@ -20,7 +20,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include "benchmark_device_find_first_of.hpp" +#include "benchmark_device_find_first_of.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser @@ -35,8 +35,8 @@ #include #include -#ifndef DEFAULT_N -const size_t DEFAULT_N = 1024 * 1024 * 32; +#ifndef DEFAULT_BYTES +constexpr size_t DEFAULT_BYTES = size_t{1} << 27; // 128 MiB #endif #define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE) \ @@ -55,9 +55,10 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; #define CREATE_BENCHMARK(TYPE) \ { \ + CREATE_BENCHMARK0(TYPE, 1) \ CREATE_BENCHMARK0(TYPE, 10) \ - CREATE_BENCHMARK0(TYPE, 128) \ - CREATE_BENCHMARK0(TYPE, 1024) \ + CREATE_BENCHMARK0(TYPE, 100) \ + CREATE_BENCHMARK0(TYPE, 1000) \ CREATE_BENCHMARK0(TYPE, 10000) \ } // clang-format on @@ -65,13 +66,24 @@ const size_t DEFAULT_N = 1024 * 1024 * 32; int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); - parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("size", "size", DEFAULT_BYTES, "number of bytes"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.set_optional("seed", "seed", "random", get_seed_message()); +#ifdef BENCHMARK_CONFIG_TUNING + // optionally run an evenly split subset of benchmarks, when making multiple program invocations + parser.set_optional("parallel_instance", + "parallel_instance", + 0, + "parallel instance index"); + parser.set_optional("parallel_instances", + "parallel_instances", + 1, + "total parallel instances"); +#endif parser.run_and_exit_if_error(); // Parse argv @@ -92,16 +104,29 @@ int main(int argc, char* argv[]) // Add benchmarks std::vector benchmarks{}; +#ifdef BENCHMARK_CONFIG_TUNING + const int parallel_instance = parser.get("parallel_instance"); + const int parallel_instances = parser.get("parallel_instances"); + config_autotune_register::register_benchmark_subset(benchmarks, + parallel_instance, + parallel_instances, + size, + seed, + stream); +#else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(int16_t) CREATE_BENCHMARK(int32_t) + CREATE_BENCHMARK(float) CREATE_BENCHMARK(int64_t) + CREATE_BENCHMARK(double) using custom_int2 = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_longlong_double) +#endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) diff --git a/benchmark/benchmark_device_find_first_of.hpp b/benchmark/benchmark_device_find_first_of.hpp deleted file mode 100644 index 970f911ec..000000000 --- a/benchmark/benchmark_device_find_first_of.hpp +++ /dev/null @@ -1,176 +0,0 @@ -// MIT License -// -// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ -#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ - -#include "benchmark_utils.hpp" - -// Google Benchmark -#include - -// HIP API -#include - -// rocPRIM -#include - -#include -#include -#include - -template -struct device_find_first_of_benchmark : public config_autotune_interface -{ - size_t keys_size; - double first_occurence; - - device_find_first_of_benchmark(size_t keys_size, double first_occurence) - : keys_size(keys_size), first_occurence(first_occurence) - {} - - std::string name() const override - { - using namespace std::string_literals; - return bench_naming::format_name( - "{lvl:device,algo:find_first_of,keys_size:" + std::to_string(keys_size) - + ",first_occurence:" + std::to_string(first_occurence) - + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); - } - - static constexpr unsigned int batch_size = 10; - static constexpr unsigned int warmup_size = 5; - - void run(benchmark::State& state, - size_t size, - const managed_seed& seed, - hipStream_t stream) const override - { - using type = Key; - using key_type = Key; - using output_type = size_t; - - // Generate data - std::vector key_input - = get_random_data(keys_size, 0, 100, seed.get_0()); - std::vector input - = get_random_data(size, 101, generate_limits::max(), seed.get_0()); - - // Set the first occurence of keys in input - const size_t p = static_cast(size * first_occurence); - if(p < size) - { - input[p] = key_input[keys_size / 2]; - } - - type* d_input; - key_type* d_key_input; - output_type* d_output; - HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); - HIP_CHECK(hipMalloc(&d_key_input, size * sizeof(*d_key_input))); - HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); - - HIP_CHECK(hipMemcpy(d_input, - input.data(), - input.size() * sizeof(*d_input), - hipMemcpyHostToDevice)); - HIP_CHECK(hipMemcpy(d_key_input, - key_input.data(), - key_input.size() * sizeof(*d_key_input), - hipMemcpyHostToDevice)); - - ::rocprim::equal_to compare_op; - - void* d_temporary_storage = nullptr; - size_t temporary_storage_bytes = 0; - - auto run = [&]() - { - HIP_CHECK(rocprim::find_first_of(d_temporary_storage, - temporary_storage_bytes, - d_input, - d_key_input, - d_output, - input.size(), - key_input.size(), - compare_op, - stream)); - }; - - run(); - HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); - - // Warm-up - for(size_t i = 0; i < warmup_size; i++) - { - run(); - } - HIP_CHECK(hipDeviceSynchronize()); - - // HIP events creation - hipEvent_t start, stop; - HIP_CHECK(hipEventCreate(&start)); - HIP_CHECK(hipEventCreate(&stop)); - - for(auto _ : state) - { - // Record start event - HIP_CHECK(hipEventRecord(start, stream)); - - for(size_t i = 0; i < batch_size; i++) - { - run(); - } - - // Record stop event and wait until it completes - HIP_CHECK(hipEventRecord(stop, stream)); - HIP_CHECK(hipEventSynchronize(stop)); - - float elapsed_mseconds; - HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); - state.SetIterationTime(elapsed_mseconds / 1000); - } - - // Destroy HIP events - HIP_CHECK(hipEventDestroy(start)); - HIP_CHECK(hipEventDestroy(stop)); - - // Only a part of data (before the first occurence) must be actually processed - const size_t effective_size = static_cast(size * first_occurence); - state.SetBytesProcessed(state.iterations() * batch_size * effective_size - * sizeof(*d_input)); - state.SetItemsProcessed(state.iterations() * batch_size * effective_size); - // All threads of all blocks read the same keys so this value is limited by cache bandwidth - state.counters["bytes_per_second_keys"] = benchmark::Counter( - static_cast(state.iterations() * batch_size * effective_size * keys_size - * sizeof(*d_key_input)), - benchmark::Counter::kIsRate, - benchmark::Counter::kIs1024); - - HIP_CHECK(hipFree(d_input)); - HIP_CHECK(hipFree(d_key_input)); - HIP_CHECK(hipFree(d_output)); - HIP_CHECK(hipFree(d_temporary_storage)); - } -}; - -#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/benchmark/benchmark_device_find_first_of.parallel.cpp.in b/benchmark/benchmark_device_find_first_of.parallel.cpp.in new file mode 100644 index 000000000..230f6640c --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.cpp.in @@ -0,0 +1,30 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark_device_find_first_of.parallel.hpp" +#include "benchmark_utils.hpp" + +namespace +{ +auto benchmarks = config_autotune_register::create_bulk( + device_find_first_of_benchmark_generator<@DataType@, @BlockSize@>::create); +} // namespace diff --git a/benchmark/benchmark_device_find_first_of.parallel.hpp b/benchmark/benchmark_device_find_first_of.parallel.hpp new file mode 100644 index 000000000..5681fe69e --- /dev/null +++ b/benchmark/benchmark_device_find_first_of.parallel.hpp @@ -0,0 +1,282 @@ +// MIT License +// +// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ +#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ + +#include "benchmark_utils.hpp" + +// Google Benchmark +#include + +// HIP API +#include + +// rocPRIM +#include + +#include +#include +#include + +template +std::string config_name() +{ + const rocprim::detail::find_first_of_config_params config = Config(); + return "{bs:" + std::to_string(config.kernel_config.block_size) + + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; +} + +template<> +inline std::string config_name() +{ + return "default_config"; +} + +template +struct device_find_first_of_benchmark : public config_autotune_interface +{ + std::vector keys_sizes; + std::vector first_occurrences; + + device_find_first_of_benchmark(size_t keys_size, double first_occurrence) + { + keys_sizes.push_back(keys_size); + first_occurrences.push_back(first_occurrence); + } + + device_find_first_of_benchmark(const std::vector& keys_sizes, + const std::vector& first_occurrences) + { + this->keys_sizes = keys_sizes; + this->first_occurrences = first_occurrences; + } + + std::string name() const override + { + using namespace std::string_literals; + return bench_naming::format_name( + "{lvl:device,algo:find_first_of,"s + + (keys_sizes.size() == 1 ? "keys_size:"s + std::to_string(keys_sizes[0]) : ""s) + + (first_occurrences.size() == 1 + ? ",first_occurrence:"s + std::to_string(first_occurrences[0]) + : ""s) + + ",value_type:"s + std::string(Traits::name()) + ",cfg:" + config_name() + + "}"); + } + + static constexpr unsigned int batch_size = 10; + static constexpr unsigned int warmup_size = 2; + + void run(benchmark::State& state, + size_t bytes, + const managed_seed& seed, + hipStream_t stream) const override + { + using type = T; + using key_type = T; + using output_type = size_t; + + const size_t size = bytes / sizeof(type); + + const size_t max_keys_size = *std::max_element(keys_sizes.begin(), keys_sizes.end()); + + // Generate data + std::vector key_input + = get_random_data(max_keys_size, 0, 100, seed.get_0()); + std::vector input + = get_random_data(size, 101, generate_limits::max(), seed.get_0()); + + std::vector d_inputs(first_occurrences.size()); + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + type* d_input; + HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input))); + HIP_CHECK(hipMemcpyAsync(d_input, + input.data(), + input.size() * sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + // Set the first occurrence of keys in input + const size_t p = static_cast(size * first_occurrences[fi]); + if(p < size) + { + const type key = key_input[0]; + HIP_CHECK(hipMemcpyAsync(d_input + p, + &key, + sizeof(*d_input), + hipMemcpyHostToDevice, + stream)); + } + d_inputs[fi] = d_input; + } + + key_type* d_key_input; + output_type* d_output; + HIP_CHECK(hipMalloc(&d_key_input, max_keys_size * sizeof(*d_key_input))); + HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output))); + + HIP_CHECK(hipMemcpy(d_key_input, + key_input.data(), + key_input.size() * sizeof(*d_key_input), + hipMemcpyHostToDevice)); + + ::rocprim::equal_to compare_op; + + void* d_temporary_storage = nullptr; + size_t temporary_storage_bytes = 0; + + auto run = [&](size_t key_size, const type* d_input) + { + HIP_CHECK(rocprim::find_first_of(d_temporary_storage, + temporary_storage_bytes, + d_input, + d_key_input, + d_output, + input.size(), + key_size, + compare_op, + stream)); + }; + + size_t max_temporary_storage_bytes = 0; + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[0]); + max_temporary_storage_bytes + = std::max(max_temporary_storage_bytes, temporary_storage_bytes); + } + temporary_storage_bytes = max_temporary_storage_bytes; + HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); + + // Warm-up + for(size_t i = 0; i < warmup_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + HIP_CHECK(hipDeviceSynchronize()); + + // HIP events creation + hipEvent_t start, stop; + HIP_CHECK(hipEventCreate(&start)); + HIP_CHECK(hipEventCreate(&stop)); + + for(auto _ : state) + { + // Record start event + HIP_CHECK(hipEventRecord(start, stream)); + + for(size_t i = 0; i < batch_size; i++) + { + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + for(size_t keys_size : keys_sizes) + { + run(keys_size, d_inputs[fi]); + } + } + } + + // Record stop event and wait until it completes + HIP_CHECK(hipEventRecord(stop, stream)); + HIP_CHECK(hipEventSynchronize(stop)); + + float elapsed_mseconds; + HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); + state.SetIterationTime(elapsed_mseconds / 1000); + } + + // Destroy HIP events + HIP_CHECK(hipEventDestroy(start)); + HIP_CHECK(hipEventDestroy(stop)); + + // Only a part of data (before the first occurrence) must be actually processed. In ideal + // cases when no thread blocks do unneeded work (i.e. exit early once the match is found), + // performance for different values of first_occurrence must be similar. + size_t sum_effective_size = 0; + for(double first_occurrence : first_occurrences) + { + sum_effective_size += static_cast(size * first_occurrence); + } + size_t sum_keys_size = 0; + for(size_t keys_size : keys_sizes) + { + sum_keys_size += keys_size; + } + state.SetBytesProcessed(state.iterations() * batch_size * sum_effective_size + * sizeof(*d_inputs[0])); + state.SetItemsProcessed(state.iterations() * batch_size * sum_effective_size); + // Each input is read once but all keys are read by all threads so performance is likely + // compute-bound or bound by cache bandwidth for reading keys rather than reading inputs. + // Let's additionally report the rate of comparisons to see if it reaches a plateau with + // increasing keys_size. + state.counters["comparisons_per_second"] + = benchmark::Counter(static_cast(state.iterations() * batch_size + * sum_effective_size * sum_keys_size), + benchmark::Counter::kIsRate); + + for(size_t fi = 0; fi < first_occurrences.size(); ++fi) + { + HIP_CHECK(hipFree(d_inputs[fi])); + } + HIP_CHECK(hipFree(d_key_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_temporary_storage)); + } +}; + +template +struct device_find_first_of_benchmark_generator +{ + + template + struct create_ipt + { + using generated_config = rocprim::find_first_of_config; + + void operator()(std::vector>& storage) + { + std::vector keys_sizes{1, 10, 100, 1000}; + std::vector first_occurrences{0.1, 0.5, 1.0}; + storage.emplace_back( + std::make_unique>( + keys_sizes, + first_occurrences)); + } + }; + + static void create(std::vector>& storage) + { + static constexpr unsigned int min_items_per_thread = 1; + static constexpr unsigned int max_items_per_thread = 16; + static_for_each, + create_ipt>(storage); + } +}; + +#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_ diff --git a/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp new file mode 100644 index 000000000..ce06277a8 --- /dev/null +++ b/rocprim/include/rocprim/device/detail/config/device_find_first_of.hpp @@ -0,0 +1,263 @@ +// Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +#define ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ + +#include "../../../type_traits.hpp" +#include "../device_config_helper.hpp" + +#include + +/* DO NOT EDIT THIS FILE + * This file is automatically generated by `/scripts/autotune/create_optimization.py`. + * so most likely you want to edit rocprim/device/device_(algo)_config.hpp + */ + +/// \addtogroup primitivesmodule_deviceconfigs +/// @{ + +BEGIN_ROCPRIM_NAMESPACE + +namespace detail +{ + +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 12> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1030), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 15> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<128, 13> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 9> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx1100), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 15> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<1024, 14> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<64, 13> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx906), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx908), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::unknown), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = int64_t +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 8) && (sizeof(value_type) > 4))>> + : find_first_of_config<256, 8> +{}; + +// Based on value_type = int +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 4) && (sizeof(value_type) > 2))>> + : find_first_of_config<256, 10> +{}; + +// Based on value_type = short +template +struct default_find_first_of_config< + static_cast(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 2) && (sizeof(value_type) > 1))>> + : find_first_of_config<256, 11> +{}; + +// Based on value_type = int8_t +template +struct default_find_first_of_config(target_arch::gfx90a), + value_type, + std::enable_if_t<((sizeof(value_type) <= 1))>> + : find_first_of_config<256, 10> +{}; + +} // end namespace detail + +END_ROCPRIM_NAMESPACE + +/// @} +// end of group primitivesmodule_deviceconfigs + +#endif // ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ diff --git a/rocprim/include/rocprim/device/detail/device_config_helper.hpp b/rocprim/include/rocprim/device/detail/device_config_helper.hpp index c539704c5..095f4f50f 100644 --- a/rocprim/include/rocprim/device/detail/device_config_helper.hpp +++ b/rocprim/include/rocprim/device/detail/device_config_helper.hpp @@ -1087,7 +1087,7 @@ namespace detail struct find_first_of_config_params { - kernel_config_params kernel_config; + kernel_config_params kernel_config{}; }; } // namespace detail @@ -1102,12 +1102,26 @@ struct find_first_of_config : public detail::find_first_of_config_params #ifndef DOXYGEN_SHOULD_SKIP_THIS constexpr find_first_of_config() : detail::find_first_of_config_params{ - {BlockSize, ItemsPerThread, ROCPRIM_GRID_SIZE_LIMIT} + {BlockSize, ItemsPerThread, 0} } {} #endif }; +namespace detail +{ + +template +struct default_find_first_of_config_base +{ + static constexpr unsigned int item_scale + = ::rocprim::detail::ceiling_div(sizeof(Value), sizeof(int)); + + using type = find_first_of_config<256, ::rocprim::max(1u, 16u / item_scale)>; +}; + +} // namespace detail + END_ROCPRIM_NAMESPACE /// @} diff --git a/rocprim/include/rocprim/device/device_find_first_of_config.hpp b/rocprim/include/rocprim/device/device_find_first_of_config.hpp index 7996329f5..26c0a30d7 100644 --- a/rocprim/include/rocprim/device/device_find_first_of_config.hpp +++ b/rocprim/include/rocprim/device/device_find_first_of_config.hpp @@ -22,7 +22,7 @@ #define ROCPRIM_DEVICE_DEVICE_FIND_FIRST_OF_CONFIG_HPP_ #include "config_types.hpp" - +#include "detail/config/device_find_first_of.hpp" #include "detail/device_config_helper.hpp" /// \addtogroup primitivesmodule_deviceconfigs @@ -51,7 +51,8 @@ struct wrapped_find_first_of_config template struct architecture_config { - static constexpr find_first_of_config_params params = {kernel_config<256, 4>()}; + static constexpr find_first_of_config_params params + = default_find_first_of_config(Arch), Type>(); }; }; diff --git a/scripts/autotune/create_optimization.py b/scripts/autotune/create_optimization.py index 130bdb3cf..2bebf1fd5 100755 --- a/scripts/autotune/create_optimization.py +++ b/scripts/autotune/create_optimization.py @@ -622,6 +622,14 @@ class AlgorithmDeviceReduceByKey(Algorithm): def __init__(self, fallback_entries): Algorithm.__init__(self, fallback_entries) +class AlgorithmDeviceFindFirstOf(Algorithm): + algorithm_name = "device_find_first_of" + cpp_configuration_template_name = "find_first_of_config_template" + config_selection_params = [ + SelectionType(name="value_type", is_optional=False, select_on_size_only=True)] + def __init__(self, fallback_entries): + Algorithm.__init__(self, fallback_entries) + def filt_algo_regex(e: FallbackCase, algorithm_name): if e.algo_regex: return re.match(e.algo_regex, algorithm_name) is not None @@ -679,6 +687,8 @@ def create_algorithm(algorithm_name: str, fallback_entries: List[FallbackCase]): return AlgorithmDeviceSelectUniqueByKey(fallback_entries) elif algorithm_name == 'device_reduce_by_key': return AlgorithmDeviceReduceByKey(fallback_entries) + elif algorithm_name == 'device_find_first_of': + return AlgorithmDeviceFindFirstOf(fallback_entries) else: raise(NotSupportedError(f'Algorithm "{algorithm_name}" is not supported (yet)')) diff --git a/scripts/autotune/templates/find_first_of_config_template b/scripts/autotune/templates/find_first_of_config_template new file mode 100644 index 000000000..84e7fb476 --- /dev/null +++ b/scripts/autotune/templates/find_first_of_config_template @@ -0,0 +1,20 @@ +{% extends "config_template" %} + +{% macro get_header_guard() %} +ROCPRIM_DEVICE_DETAIL_CONFIG_DEVICE_FIND_FIRST_OF_HPP_ +{%- endmacro %} + +{% macro kernel_configuration(measurement) -%} +find_first_of_config<{{ measurement['cfg']['bs'] }}, {{ measurement['cfg']['ipt'] }}> { }; +{%- endmacro %} + +{% macro general_case() -%} +template +struct default_find_first_of_config : default_find_first_of_config_base::type +{}; +{%- endmacro %} + +{% macro configuration_fallback(benchmark_of_architecture, based_on_type, fallback_selection_criteria) -%} +// Based on {{ based_on_type }} +template struct default_find_first_of_config({{ benchmark_of_architecture.name }}), value_type, {{ fallback_selection_criteria }}> : +{%- endmacro %}