From 7580bdd7e53aa2c2ca2f427e5595b94d675a061a Mon Sep 17 00:00:00 2001 From: Nara Prasetya Date: Wed, 16 Aug 2023 08:17:44 +0000 Subject: [PATCH] Implement block run length decode --- CHANGELOG.md | 1 + benchmark/CMakeLists.txt | 1 + .../benchmark_block_run_length_decode.cpp | 242 ++++++++++++ .../rocprim/block/block_run_length_decode.hpp | 367 ++++++++++++++++++ rocprim/include/rocprim/rocprim.hpp | 3 +- .../include/rocprim/thread/thread_search.hpp | 76 ++-- test/rocprim/CMakeLists.txt | 1 + test/rocprim/test_block_run_length_decode.cpp | 235 +++++++++++ test/rocprim/test_utils_data_generation.hpp | 25 +- 9 files changed, 921 insertions(+), 30 deletions(-) create mode 100644 benchmark/benchmark_block_run_length_decode.cpp create mode 100644 rocprim/include/rocprim/block/block_run_length_decode.hpp create mode 100644 test/rocprim/test_block_run_length_decode.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d0573522..7416f6fda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/e ## [Unreleased rocPRIM-3.0.0 for ROCm 6.0.0] ### Added + - Added new primitive: `block_run_length_decode`. ### Changed - Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`. - Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`, `reduce_by_key_config_v2` to `reduce_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`. diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index c65de966c..8087b43ff 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -123,6 +123,7 @@ add_rocprim_benchmark(benchmark_block_histogram.cpp) add_rocprim_benchmark(benchmark_block_radix_sort.cpp) add_rocprim_benchmark(benchmark_block_radix_rank.cpp) add_rocprim_benchmark(benchmark_block_reduce.cpp) +add_rocprim_benchmark(benchmark_block_run_length_decode.cpp) add_rocprim_benchmark(benchmark_block_scan.cpp) add_rocprim_benchmark(benchmark_block_sort.cpp) add_rocprim_benchmark(benchmark_config_dispatch.cpp) diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp new file mode 100644 index 000000000..04e1f0428 --- /dev/null +++ b/benchmark/benchmark_block_run_length_decode.cpp @@ -0,0 +1,242 @@ +// MIT License +// +// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "benchmark/benchmark.h" +#include "benchmark_utils.hpp" +#include "cmdparser.hpp" + +#include "rocprim/block/block_load.hpp" +#include "rocprim/block/block_run_length_decode.hpp" +#include "rocprim/block/block_store.hpp" + +#include +#include + +#ifndef DEFAULT_N +const size_t DEFAULT_N = 1024 * 1024 * 32; +#endif + +template +__global__ + __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, + const OffsetT* d_run_offsets, + ItemT* d_decoded_items, + bool enable_store = false) +{ + using BlockRunLengthDecodeT + = rocprim::block_run_length_decode; + + ItemT run_items[RunsPerThread]; + OffsetT run_offsets[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); + rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets); + + ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage; + BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); + + const OffsetT total_decoded_size + = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] + - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; + +#pragma nounroll + for(unsigned i = 0; i < Trials; ++i) + { + OffsetT decoded_window_offset = 0; + while(decoded_window_offset < total_decoded_size) + { + ItemT decoded_items[DecodedItemsPerThread]; + block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset); + + if(enable_store) + { + rocprim::block_store_direct_blocked(global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items); + } + + decoded_window_offset += BlockSize * DecodedItemsPerThread; + } + } +} + +template +void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) +{ + constexpr auto runs_per_block = BlockSize * RunsPerThread; + const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); + const auto num_runs + = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); + + std::vector run_items(num_runs); + std::vector run_offsets(num_runs + 1); + + std::default_random_engine prng(std::random_device{}()); + using ItemDistribution = std::conditional_t::value, + std::uniform_int_distribution, + std::uniform_real_distribution>; + ItemDistribution run_item_dist(0, 100); + std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); + + for(size_t i = 0; i < num_runs; ++i) + { + run_items[i] = run_item_dist(prng); + } + for(size_t i = 1; i < num_runs + 1; ++i) + { + const OffsetT next_run_length = run_length_dist(prng); + run_offsets[i] = run_offsets[i - 1] + next_run_length; + } + const OffsetT output_length = run_offsets.back(); + + ItemT* d_run_items{}; + HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + OffsetT* d_run_offsets{}; + HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); + HIP_CHECK(hipMemcpy(d_run_offsets, + run_offsets.data(), + run_offsets.size() * sizeof(OffsetT), + hipMemcpyHostToDevice)); + + ItemT* d_output{}; + HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); + + for(auto _ : state) + { + auto start = std::chrono::high_resolution_clock::now(); + hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel), + dim3(num_runs / runs_per_block), + dim3(BlockSize), + 0, + stream, + d_run_items, + d_run_offsets, + d_output); + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + auto end = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds + = std::chrono::duration_cast>(end - start); + + state.SetIterationTime(elapsed_seconds.count()); + } + state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); + state.SetItemsProcessed(state.iterations() * output_length * Trials); + + HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_offsets)); + HIP_CHECK(hipFree(d_output)); +} + +#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ + benchmark::RegisterBenchmark("block_run_length_decode", \ + &run_benchmark, \ + stream, \ + size) + +int main(int argc, char* argv[]) +{ + cli::Parser parser(argc, argv); + parser.set_optional("size", "size", DEFAULT_N, "number of values"); + parser.set_optional("trials", "trials", -1, "number of iterations"); + parser.run_and_exit_if_error(); + + // Parse argv + benchmark::Initialize(&argc, argv); + const size_t size = parser.get("size"); + const int trials = parser.get("trials"); + + std::cout << "benchmark_block_run_length_decode" << std::endl; + + // HIP + hipStream_t stream = 0; // default + hipDeviceProp_t devProp; + int device_id = 0; + HIP_CHECK(hipGetDevice(&device_id)); + HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); + std::cout << "[HIP] Device name: " << devProp.name << std::endl; + + // Add benchmarks + std::vector benchmarks{ + CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), + + CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), + CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)}; + + // Use manual timing + for(auto& b : benchmarks) + { + b->UseManualTime(); + b->Unit(benchmark::kMillisecond); + } + + // Force number of iterations + if(trials > 0) + { + for(auto& b : benchmarks) + { + b->Iterations(trials); + } + } + + // Run benchmarks + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/rocprim/include/rocprim/block/block_run_length_decode.hpp b/rocprim/include/rocprim/block/block_run_length_decode.hpp new file mode 100644 index 000000000..a4b6ce93d --- /dev/null +++ b/rocprim/include/rocprim/block/block_run_length_decode.hpp @@ -0,0 +1,367 @@ +/****************************************************************************** + * Copyright (c) 2010-2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2021-2023, Advanced Micro Devices, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +#ifndef ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_ +#define ROCPRIM_BLOCK_BLOCK_RUN_LENGTH_DECODE_HPP_ + +#include "../block/block_scan.hpp" +#include "../config.hpp" +#include "../detail/temp_storage.hpp" +#include "../detail/various.hpp" +#include "../functional.hpp" +#include "../intrinsics/thread.hpp" +#include "../thread/thread_search.hpp" + +BEGIN_ROCPRIM_NAMESPACE + +/** + * \brief The block_run_length_decode class supports decoding a run-length encoded array of items. That is, given + * the two arrays run_value[N] and run_lengths[N], run_value[i] is repeated run_lengths[i] many times in the output + * array. + * Due to the nature of the run-length decoding algorithm ("decompression"), the output size of the run-length decoded + * array is runtime-dependent and potentially without any upper bound. To address this, block_run_length_decode allows + * retrieving a "window" from the run-length decoded array. The window's offset can be specified and BLOCK_THREADS * + * DECODED_ITEMS_PER_THREAD (i.e., referred to as window_size) decoded items from the specified window will be returned. + * + * \note: Trailing runs of length 0 are supported (i.e., they may only appear at the end of the run_lengths array). + * A run of length zero may not be followed by a run length that is not zero. + * + * \par + * \code + * __global__ void ExampleKernel(...) + * { + * // Specialising block_run_length_decode to run-length decode items of type uint64_t + * using RunItemT = uint64_t; + * // Type large enough to index into the run-length decoded array + * using RunLengthT = uint32_t; + * + * // Specialising block_run_length_decode for a 1D block of 128 threads + * constexpr int BLOCK_DIM_X = 128; + * // Specialising block_run_length_decode to have each thread contribute 2 run-length encoded runs + * constexpr int RUNS_PER_THREAD = 2; + * // Specialising block_run_length_decode to have each thread hold 4 run-length decoded items + * constexpr int DECODED_ITEMS_PER_THREAD = 4; + * + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * using block_run_length_decodeT = + * hipcub::block_run_length_decode; + * + * // Allocate shared memory for block_run_length_decode + * __shared__ typename block_run_length_decodeT::TempStorage temp_storage; + * + * // The run-length encoded items and how often they shall be repeated in the run-length decoded output + * RunItemT run_values[RUNS_PER_THREAD]; + * RunLengthT run_lengths[RUNS_PER_THREAD]; + * ... + * + * // Initialize the block_run_length_decode with the runs that we want to run-length decode + * uint32_t total_decoded_size = 0; + * block_run_length_decodeT block_rld(temp_storage, run_values, run_lengths, total_decoded_size); + * + * // Run-length decode ("decompress") the runs into a window buffer of limited size. This is repeated until all runs + * // have been decoded. + * uint32_t decoded_window_offset = 0U; + * while (decoded_window_offset < total_decoded_size) + * { + * RunLengthT relative_offsets[DECODED_ITEMS_PER_THREAD]; + * RunItemT decoded_items[DECODED_ITEMS_PER_THREAD]; + * + * // The number of decoded items that are valid within this window (aka pass) of run-length decoding + * uint32_t num_valid_items = total_decoded_size - decoded_window_offset; + * block_rld.run_length_decode(decoded_items, relative_offsets, decoded_window_offset); + * + * decoded_window_offset += BLOCK_DIM_X * DECODED_ITEMS_PER_THREAD; + * + * ... + * } + * } + * \endcode + * \par + * Suppose the set of input \p run_values across the block of threads is + * { [0, 1], [2, 3], [4, 5], [6, 7], ..., [254, 255] } and + * \p run_lengths is { [1, 2], [3, 4], [5, 1], [2, 3], ..., [5, 1] }. + * The corresponding output \p decoded_items in those threads will be { [0, 1, 1, 2], [2, 2, 3, 3], [3, 3, 4, 4], + * [4, 4, 4, 5], ..., [169, 169, 170, 171] } and \p relative_offsets will be { [0, 0, 1, 0], [1, 2, 0, 1], [2, + * 3, 0, 1], [2, 3, 4, 0], ..., [3, 4, 0, 0] } during the first iteration of the while loop. + * + * \tparam ItemT The data type of the items being run-length decoded + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RUNS_PER_THREAD The number of consecutive runs that each thread contributes + * \tparam DECODED_ITEMS_PER_THREAD The maximum number of decoded items that each thread holds + * \tparam DecodedOffsetT Type used to index into the block's decoded items (large enough to hold the sum over all the + * runs' lengths) + * \tparam BLOCK_DIM_Y The thread block length in threads along the Y dimension + * \tparam BLOCK_DIM_Z The thread block length in threads along the Z dimension + */ +template +class block_run_length_decode +{ +private: + /// The thread block size in threads + static constexpr int BLOCK_THREADS = BlockSizeX * BlockSizeY * BlockSizeZ; + + /// The number of runs that the block decodes (out-of-bounds items may be padded with run lengths of '0') + static constexpr int BLOCK_RUNS = BLOCK_THREADS * RUNS_PER_THREAD; + + /// block_scan used to determine the beginning of each run (i.e., prefix sum over the runs' length) + using block_scan_type = rocprim::block_scan; + + /// Type used to index into the block's runs + using RunOffsetT = uint32_t; + + /// Shared memory type required by this thread block + union storage_type_ + { + typename block_scan_type::storage_type offset_scan; + struct + { + ItemT run_values[BLOCK_RUNS]; + DecodedOffsetT run_offsets[BLOCK_RUNS]; + } runs; + }; + + ROCPRIM_DEVICE ROCPRIM_INLINE storage_type_& private_storage() + { + ROCPRIM_SHARED_MEMORY storage_type private_storage; + return private_storage.get(); + } + + storage_type_& temp_storage; + + uint32_t linear_tid; + +public: + using storage_type = detail::raw_storage; + + /** + * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' lengths. The + * algorithm's temporary storage may not be repurposed between the constructor call and subsequent + * run_length_decode calls. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(storage_type& temp_storage, + ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + : temp_storage(temp_storage.get()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_lengths(run_values, run_lengths, total_decoded_size); + } + + /** + * \brief Constructor specialised for user-provided temporary storage, initializing using the runs' offsets. The + * algorithm's temporary storage may not be repurposed between the constructor call and subsequent + * run_length_decode calls. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(storage_type& temp_storage, + ItemT (&run_values)[RUNS_PER_THREAD], + UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + : temp_storage(temp_storage.get()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_offsets(run_values, run_offsets); + } + + /** + * \brief Constructor specialised for static temporary storage, initializing using the runs' lengths. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + : temp_storage(private_storage()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_lengths(run_values, run_lengths, total_decoded_size); + } + + /** + * \brief Constructor specialised for static temporary storage, initializing using the runs' offsets. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE + block_run_length_decode(ItemT (&run_values)[RUNS_PER_THREAD], + UserRunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + : temp_storage(private_storage()) + , linear_tid(::rocprim::flat_block_thread_id()) + { + init_with_run_offsets(run_values, run_offsets); + } + +private: + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + init_with_run_offsets(ItemT (&run_values)[RUNS_PER_THREAD], + RunOffsetT (&run_offsets)[RUNS_PER_THREAD]) + { + // Keep the runs' items and the offsets of each run's beginning in the temporary storage + RunOffsetT thread_dst_offset + = static_cast(linear_tid) * static_cast(RUNS_PER_THREAD); + +#pragma unroll + for(int i = 0; i < RUNS_PER_THREAD; ++i, ++thread_dst_offset) + { + temp_storage.runs.run_values[thread_dst_offset] = run_values[i]; + temp_storage.runs.run_offsets[thread_dst_offset] = run_offsets[i]; + } + + // Ensure run offsets and run values have been writen to shared memory + syncthreads(); + } + + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + init_with_run_lengths(ItemT (&run_values)[RUNS_PER_THREAD], + RunLengthT (&run_lengths)[RUNS_PER_THREAD], + TotalDecodedSizeT& total_decoded_size) + { + // Compute the offset for the beginning of each run + DecodedOffsetT run_offsets[RUNS_PER_THREAD]; +#pragma unroll + for(int i = 0; i < RUNS_PER_THREAD; ++i) + { + run_offsets[i] = static_cast(run_lengths[i]); + } + + DecodedOffsetT decoded_size_aggregate{}; + block_scan_type().exclusive_scan(run_offsets, + run_offsets, + 0, + decoded_size_aggregate, + temp_storage.offset_scan, + rocprim::plus{}); + total_decoded_size = static_cast(decoded_size_aggregate); + + // Ensure the prefix scan's temporary storage can be reused (may be superfluous, but depends on scan implementation) + syncthreads(); + + init_with_run_offsets(run_values, run_offsets); + } + +public: + /** + * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded + * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the + * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within + * the buffer are returned. Subsequent calls to run_length_decode adjusting \p from_decoded_offset can be + * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to + * run_length_decode is not required. + * \p item_offsets can be used to retrieve each run-length decoded item's relative index within its run. E.g., the + * run-length encoded array of `3, 1, 4` with the respective run lengths of `2, 1, 3` would yield the run-length + * decoded array of `3, 3, 1, 4, 4, 4` with the relative offsets of `0, 1, 0, 0, 1, 2`. + * \smemreuse + * + * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement + * \param[out] item_offsets The run-length decoded items' relative offset within the run they belong to + * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results + * in undefined behavior. + */ + template + ROCPRIM_DEVICE ROCPRIM_INLINE void + run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], + RelativeOffsetT (&item_offsets)[DECODED_ITEMS_PER_THREAD], + DecodedOffsetT from_decoded_offset = 0) + { + // The (global) offset of the first item decoded by this thread + DecodedOffsetT thread_decoded_offset + = from_decoded_offset + linear_tid * DECODED_ITEMS_PER_THREAD; + + // The run that the first decoded item of this thread belongs to + // If this thread's is already beyond the total decoded size, it will be assigned to the + // last run + RunOffsetT current_run + = rocprim::static_upper_bound(temp_storage.runs.run_offsets, + BLOCK_RUNS, + thread_decoded_offset) + - static_cast(1U); + + // Set the current_run_end to thread_decoded_offset to trigger new run branch in the first iteration + DecodedOffsetT current_run_begin, current_run_end = thread_decoded_offset; + + ItemT val{}; + +#pragma unroll + for(DecodedOffsetT i = 0; i < DECODED_ITEMS_PER_THREAD; ++i, ++thread_decoded_offset) + { + // If we are in a new run... + if(thread_decoded_offset == current_run_end) + { + // The value of the new run + val = temp_storage.runs.run_values[current_run]; + + // The run bounds + current_run_begin = thread_decoded_offset; + current_run_end = temp_storage.runs.run_offsets[++current_run]; + } + + // Decode the current run by storing the run's value + decoded_items[i] = val; + item_offsets[i] = thread_decoded_offset - current_run_begin; + } + } + + /** + * \brief Run-length decodes the runs previously passed via a call to Init(...) and returns the run-length decoded + * items in a blocked arrangement to \p decoded_items. If the number of run-length decoded items exceeds the + * run-length decode buffer (i.e., DECODED_ITEMS_PER_THREAD * BLOCK_THREADS), only the items that fit within + * the buffer are returned. Subsequent calls to run_length_decode adjusting \p from_decoded_offset can be + * used to retrieve the remaining run-length decoded items. Calling __syncthreads() between any two calls to + * run_length_decode is not required. + * + * \param[out] decoded_items The run-length decoded items to be returned in a blocked arrangement + * \param[in] from_decoded_offset If invoked with from_decoded_offset that is larger than total_decoded_size results + * in undefined behavior. + */ + ROCPRIM_DEVICE ROCPRIM_INLINE void + run_length_decode(ItemT (&decoded_items)[DECODED_ITEMS_PER_THREAD], + DecodedOffsetT from_decoded_offset = 0) + { + DecodedOffsetT item_offsets[DECODED_ITEMS_PER_THREAD]; + run_length_decode(decoded_items, item_offsets, from_decoded_offset); + } +}; + +END_ROCPRIM_NAMESPACE + +#endif diff --git a/rocprim/include/rocprim/rocprim.hpp b/rocprim/include/rocprim/rocprim.hpp index c2b587f5b..6a2ecabf8 100644 --- a/rocprim/include/rocprim/rocprim.hpp +++ b/rocprim/include/rocprim/rocprim.hpp @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal @@ -45,6 +45,7 @@ #include "block/block_histogram.hpp" #include "block/block_load.hpp" #include "block/block_radix_sort.hpp" +#include "block/block_run_length_decode.hpp" #include "block/block_scan.hpp" #include "block/block_sort.hpp" #include "block/block_store.hpp" diff --git a/rocprim/include/rocprim/thread/thread_search.hpp b/rocprim/include/rocprim/thread/thread_search.hpp index 54fe38d25..c6b6e7b03 100644 --- a/rocprim/include/rocprim/thread/thread_search.hpp +++ b/rocprim/include/rocprim/thread/thread_search.hpp @@ -75,33 +75,24 @@ ROCPRIM_HOST_DEVICE inline void merge_path_search( path_coordinate.y = diagonal - split_min; } - - - /// \brief Returns the offset of the first value within \p input which does not compare less than \p val /// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched /// \tparam OffsetT - [inferred] The data type of num_items /// \tparam T - [inferred] The data type of the input sequence elements /// \param input [in] - Input sequence -/// \param num_items [out] - Input sequence length +/// \param num_items [in] - Input sequence length /// \param val [in] - Search Key /// \return - Offset at which val was found -template < - typename InputIteratorT, - typename OffsetT, - typename T> -ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound( - InputIteratorT input, - OffsetT num_items, - T val) +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; - while (num_items > 0) + while(num_items > 0) { OffsetT half = num_items >> 1; - if (input[retval + half] < val) + if(input[retval + half] < val) { - retval = retval + (half + 1); + retval = retval + (half + 1); num_items = num_items - (half + 1); } else @@ -113,35 +104,28 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT lower_bound( return retval; } - /// \brief Returns the offset of the first value within \p input which compares greater than \p val /// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched /// \tparam OffsetT - [inferred] The data type of num_items /// \tparam T - [inferred] The data type of the input sequence elements /// \param input [in] - Input sequence -/// \param num_items [out] - Input sequence length +/// \param num_items [in] - Input sequence length /// \param val [in] - Search Key /// \return - Offset at which val was found -template < - typename InputIteratorT, - typename OffsetT, - typename T> -ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound( - InputIteratorT input, ///< [in] Input sequence - OffsetT num_items, ///< [in] Input sequence length - T val) ///< [in] Search key +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound(InputIteratorT input, OffsetT num_items, T val) { OffsetT retval = 0; - while (num_items > 0) + while(num_items > 0) { OffsetT half = num_items >> 1; - if (val < input[retval + half]) + if(val < input[retval + half]) { num_items = half; } else { - retval = retval + (half + 1); + retval = retval + (half + 1); num_items = num_items - (half + 1); } } @@ -149,6 +133,42 @@ ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT upper_bound( return retval; } +/// \brief Returns the offset of the first value within \p input which compares greater than \p val +/// computed as a statically unrolled loop +/// \tparam MaxNumItems - The maximum number of items. +/// \tparam InputIteratorT - [inferred] Type of iterator for the input data to be searched +/// \tparam OffsetT - [inferred] The data type of num_items +/// \tparam T - [inferred] The data type of the input sequence elements +/// \param input [in] - Input sequence +/// \param num_items [in] - Input sequence length +/// \param val [in] - Search Key +/// \return - Offset at which val was found +template +ROCPRIM_DEVICE ROCPRIM_INLINE OffsetT static_upper_bound(InputIteratorT input, + OffsetT num_items, + T val) +{ + OffsetT lower_bound = 0; + OffsetT upper_bound = num_items; +#pragma unroll + for(int i = 0; i <= Log2::VALUE; i++) + { + OffsetT mid = lower_bound + (upper_bound - lower_bound) / 2; + mid = rocprim::min(mid, num_items - 1); + + if(val < input[mid]) + { + upper_bound = mid; + } + else + { + lower_bound = mid + 1; + } + } + + return lower_bound; +} + END_ROCPRIM_NAMESPACE #endif // ROCPRIM_THREAD_THREAD_SCAN_HPP_ diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 78e039a95..8db2d5c5b 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -234,6 +234,7 @@ add_rocprim_test("rocprim.block_sort_merge_stable" test_block_sort_merge_stable. add_rocprim_test_parallel("rocprim.block_radix_rank" test_block_radix_rank.cpp.in) add_rocprim_test("rocprim.block_radix_sort" test_block_radix_sort.cpp) add_rocprim_test("rocprim.block_reduce" test_block_reduce.cpp) +add_rocprim_test("rocprim.block_run_length_decode" test_block_run_length_decode.cpp) add_rocprim_test_parallel("rocprim.block_scan" test_block_scan.cpp.in) add_rocprim_test("rocprim.block_shuffle" test_block_shuffle.cpp) add_rocprim_test("rocprim.block_sort_bitonic" test_block_sort_bitonic.cpp) diff --git a/test/rocprim/test_block_run_length_decode.cpp b/test/rocprim/test_block_run_length_decode.cpp new file mode 100644 index 000000000..6fecc5501 --- /dev/null +++ b/test/rocprim/test_block_run_length_decode.cpp @@ -0,0 +1,235 @@ +// MIT License +// +// Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include "../common_test_header.hpp" + +// required rocprim headers +#include +#include +#include +#include + +// required test headers +#include "rocprim/block/block_load_func.hpp" +#include "rocprim/block/block_store_func.hpp" +#include "rocprim/functional.hpp" +#include "test_utils_types.hpp" + +template +struct Params +{ + using item_type = ItemT; + using length_type = LengthT; + static constexpr unsigned block_size = BlockSize; + static constexpr unsigned runs_per_thread = RunsPerThread; + static constexpr unsigned decoded_items_per_thread = DecodedItemsPerThread; +}; + +template +class HipcubBlockRunLengthDecodeTest : public ::testing::Test +{ +public: + using params = Params; +}; + +//using HipcubBlockRunLengthDecodeTestParams = ::testing::Types>; +using HipcubBlockRunLengthDecodeTestParams + = ::testing::Types, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params, + + Params, + Params, + Params, + Params, + Params, + Params>; + +TYPED_TEST_SUITE(HipcubBlockRunLengthDecodeTest, HipcubBlockRunLengthDecodeTestParams); + +template +__global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel( + const ItemT* d_run_items, const LengthT* d_run_lengths, ItemT* d_decoded_items) +{ + using BlockRunLengthDecodeT + = rocprim::block_run_length_decode; + + static constexpr unsigned int decoded_items_per_block = BlockSize * DecodedItemsPerThread; + + ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage; + + ItemT run_items[RunsPerThread]; + LengthT run_lengths[RunsPerThread]; + + const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; + rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); + rocprim::block_load_direct_blocked(global_thread_idx, d_run_lengths, run_lengths); + + unsigned total_decoded_size{}; + BlockRunLengthDecodeT block_run_length_decode(temp_storage, + run_items, + run_lengths, + total_decoded_size); + + unsigned decoded_window_offset = 0; + while(decoded_window_offset < total_decoded_size) + { + ItemT decoded_items[DecodedItemsPerThread]; + + block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset); + rocprim::block_store_direct_blocked( + global_thread_idx, + d_decoded_items + decoded_window_offset, + decoded_items, + rocprim::minimum{}(total_decoded_size - decoded_window_offset, + decoded_items_per_block)); + + decoded_window_offset += decoded_items_per_block; + } +} + +TYPED_TEST(HipcubBlockRunLengthDecodeTest, TestDecode) +{ + int device_id = test_common_utils::obtain_device_from_ctest(); + SCOPED_TRACE(testing::Message() << "with device_id= " << device_id); + HIP_CHECK(hipSetDevice(device_id)); + + using ItemT = typename TestFixture::params::item_type; + using LengthT = typename TestFixture::params::length_type; + constexpr unsigned block_size = TestFixture::params::block_size; + constexpr unsigned runs_per_thread = TestFixture::params::runs_per_thread; + constexpr unsigned decoded_items_per_thread = TestFixture::params::decoded_items_per_thread; + + for(size_t seed_index = 0; seed_index < random_seeds_count + seed_size; seed_index++) + { + const unsigned int seed_value + = seed_index < random_seeds_count ? rand() : seeds[seed_index - random_seeds_count]; + SCOPED_TRACE(testing::Message() << "with seed= " << seed_value); + + const LengthT max_run_length = static_cast( + std::min(1000ll, static_cast(std::numeric_limits::max()))); + + size_t num_runs = runs_per_thread * block_size; + auto run_items = test_utils::get_random_data(num_runs, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); + auto run_lengths = test_utils::get_random_data(num_runs, + static_cast(1), + max_run_length, + seed_value); + + std::default_random_engine prng(seed_value); + std::uniform_int_distribution num_empty_runs_dist(1, 4); + const size_t num_trailing_empty_runs = num_empty_runs_dist(prng); + num_runs += num_trailing_empty_runs; + + const auto empty_run_items + = test_utils::get_random_data(num_trailing_empty_runs, + std::numeric_limits::min(), + std::numeric_limits::max(), + seed_value); + run_items.insert(run_items.end(), empty_run_items.begin(), empty_run_items.end()); + run_lengths.insert(run_lengths.end(), num_trailing_empty_runs, static_cast(0)); + + std::vector expected; + for(size_t i = 0; i < run_items.size(); ++i) + { + for(size_t j = 0; j < static_cast(run_lengths[i]); ++j) + { + expected.push_back(run_items[i]); + } + } + + ItemT* d_run_items{}; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_run_items, run_items.size() * sizeof(ItemT))); + HIP_CHECK(hipMemcpy(d_run_items, + run_items.data(), + run_items.size() * sizeof(ItemT), + hipMemcpyHostToDevice)); + + LengthT* d_run_lengths{}; + HIP_CHECK(test_common_utils::hipMallocHelper(&d_run_lengths, + run_lengths.size() * sizeof(LengthT))); + HIP_CHECK(hipMemcpy(d_run_lengths, + run_lengths.data(), + run_lengths.size() * sizeof(LengthT), + hipMemcpyHostToDevice)); + + ItemT* d_decoded_runs{}; + HIP_CHECK( + test_common_utils::hipMallocHelper(&d_decoded_runs, expected.size() * sizeof(ItemT))); + + block_run_length_decode_kernel + <<>>(d_run_items, d_run_lengths, d_decoded_runs); + + HIP_CHECK(hipPeekAtLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + std::vector output(expected.size()); + HIP_CHECK(hipMemcpy(output.data(), + d_decoded_runs, + output.size() * sizeof(ItemT), + hipMemcpyDeviceToHost)); + HIP_CHECK(hipGetLastError()) + + HIP_CHECK(hipFree(d_run_items)); + HIP_CHECK(hipFree(d_run_lengths)); + HIP_CHECK(hipFree(d_decoded_runs)); + + for(size_t i = 0; i < output.size(); ++i) + { + ASSERT_EQ(test_utils::convert_to_native(output[i]), + test_utils::convert_to_native(expected[i])); + } + } +} diff --git a/test/rocprim/test_utils_data_generation.hpp b/test/rocprim/test_utils_data_generation.hpp index 00621e48c..a299dd81a 100644 --- a/test/rocprim/test_utils_data_generation.hpp +++ b/test/rocprim/test_utils_data_generation.hpp @@ -23,10 +23,16 @@ // Std::memcpy and std::memcmp #include +#include -#include "test_utils_half.hpp" +#include +#include +#include + +#include "common_test_header.hpp" #include "test_utils_bfloat16.hpp" #include "test_utils_custom_test_types.hpp" +#include "test_utils_half.hpp" namespace test_utils { @@ -113,6 +119,23 @@ template<> class numeric_limits : public std::numeric_limi }; // End of extended numeric_limits +// Converts possible device side types to their relevant host side native types +inline rocprim::native_half convert_to_native(const rocprim::half& value) +{ + return rocprim::native_half(value); +} + +inline rocprim::native_bfloat16 convert_to_native(const rocprim::bfloat16& value) +{ + return rocprim::native_bfloat16(value); +} + +template +inline auto convert_to_native(const T& value) +{ + return value; +} + // Helper class to generate a vector of special values for any type template struct special_values {