Merge branch 'block-run-length-decode' into 'develop_stream'

Implement block run length decode Closes #638 See merge request amd/libraries/rocPRIM!546
ROCm · Oct 18, 2023 · f7083e0 · f7083e0
2 parents 38ba018 + 7580bdd
commit f7083e0
Show file tree

Hide file tree

Showing 9 changed files with 921 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,7 @@ Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/e
 
 ## [Unreleased rocPRIM-3.0.0 for ROCm 6.0.0]
 ### Added
+ - Added new primitive: `block_run_length_decode`.
 ### Changed
 - Removed deprecated functionality: `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config` and `radix_sort_config`.
 - Renamed `scan_config_v2` to `scan_config`, `scan_by_key_config_v2` to `scan_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`, `reduce_by_key_config_v2` to `reduce_by_key_config`, `radix_sort_config_v2` to `radix_sort_config`.

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -123,6 +123,7 @@ add_rocprim_benchmark(benchmark_block_histogram.cpp)
 add_rocprim_benchmark(benchmark_block_radix_sort.cpp)
 add_rocprim_benchmark(benchmark_block_radix_rank.cpp)
 add_rocprim_benchmark(benchmark_block_reduce.cpp)
+add_rocprim_benchmark(benchmark_block_run_length_decode.cpp)
 add_rocprim_benchmark(benchmark_block_scan.cpp)
 add_rocprim_benchmark(benchmark_block_sort.cpp)
 add_rocprim_benchmark(benchmark_config_dispatch.cpp)

diff --git a/benchmark/benchmark_block_run_length_decode.cpp b/benchmark/benchmark_block_run_length_decode.cpp
@@ -0,0 +1,242 @@
+// MIT License
+//
+// Copyright (c) 2021-2023 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "benchmark/benchmark.h"
+#include "benchmark_utils.hpp"
+#include "cmdparser.hpp"
+
+#include "rocprim/block/block_load.hpp"
+#include "rocprim/block/block_run_length_decode.hpp"
+#include "rocprim/block/block_store.hpp"
+
+#include <random>
+#include <vector>
+
+#ifndef DEFAULT_N
+const size_t DEFAULT_N = 1024 * 1024 * 32;
+#endif
+
+template<class ItemT,
+         class OffsetT,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials>
+__global__
+    __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT*   d_run_items,
+                                                                     const OffsetT* d_run_offsets,
+                                                                     ItemT*         d_decoded_items,
+                                                                     bool enable_store = false)
+{
+    using BlockRunLengthDecodeT
+        = rocprim::block_run_length_decode<ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread>;
+
+    ItemT   run_items[RunsPerThread];
+    OffsetT run_offsets[RunsPerThread];
+
+    const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x;
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items);
+    rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets);
+
+    ROCPRIM_SHARED_MEMORY typename BlockRunLengthDecodeT::storage_type temp_storage;
+    BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets);
+
+    const OffsetT total_decoded_size
+        = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread]
+          - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread];
+
+#pragma nounroll
+    for(unsigned i = 0; i < Trials; ++i)
+    {
+        OffsetT decoded_window_offset = 0;
+        while(decoded_window_offset < total_decoded_size)
+        {
+            ItemT decoded_items[DecodedItemsPerThread];
+            block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset);
+
+            if(enable_store)
+            {
+                rocprim::block_store_direct_blocked(global_thread_idx,
+                                                    d_decoded_items + decoded_window_offset,
+                                                    decoded_items);
+            }
+
+            decoded_window_offset += BlockSize * DecodedItemsPerThread;
+        }
+    }
+}
+
+template<class ItemT,
+         class OffsetT,
+         unsigned MinRunLength,
+         unsigned MaxRunLength,
+         unsigned BlockSize,
+         unsigned RunsPerThread,
+         unsigned DecodedItemsPerThread,
+         unsigned Trials = 100>
+void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N)
+{
+    constexpr auto runs_per_block  = BlockSize * RunsPerThread;
+    const auto     target_num_runs = 2 * N / (MinRunLength + MaxRunLength);
+    const auto     num_runs
+        = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block);
+
+    std::vector<ItemT>   run_items(num_runs);
+    std::vector<OffsetT> run_offsets(num_runs + 1);
+
+    std::default_random_engine prng(std::random_device{}());
+    using ItemDistribution = std::conditional_t<std::is_integral<ItemT>::value,
+                                                std::uniform_int_distribution<ItemT>,
+                                                std::uniform_real_distribution<ItemT>>;
+    ItemDistribution                       run_item_dist(0, 100);
+    std::uniform_int_distribution<OffsetT> run_length_dist(MinRunLength, MaxRunLength);
+
+    for(size_t i = 0; i < num_runs; ++i)
+    {
+        run_items[i] = run_item_dist(prng);
+    }
+    for(size_t i = 1; i < num_runs + 1; ++i)
+    {
+        const OffsetT next_run_length = run_length_dist(prng);
+        run_offsets[i]                = run_offsets[i - 1] + next_run_length;
+    }
+    const OffsetT output_length = run_offsets.back();
+
+    ItemT* d_run_items{};
+    HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT)));
+    HIP_CHECK(hipMemcpy(d_run_items,
+                        run_items.data(),
+                        run_items.size() * sizeof(ItemT),
+                        hipMemcpyHostToDevice));
+
+    OffsetT* d_run_offsets{};
+    HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT)));
+    HIP_CHECK(hipMemcpy(d_run_offsets,
+                        run_offsets.data(),
+                        run_offsets.size() * sizeof(OffsetT),
+                        hipMemcpyHostToDevice));
+
+    ItemT* d_output{};
+    HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT)));
+
+    for(auto _ : state)
+    {
+        auto start = std::chrono::high_resolution_clock::now();
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(block_run_length_decode_kernel<ItemT,
+                                                                          OffsetT,
+                                                                          BlockSize,
+                                                                          RunsPerThread,
+                                                                          DecodedItemsPerThread,
+                                                                          Trials>),
+                           dim3(num_runs / runs_per_block),
+                           dim3(BlockSize),
+                           0,
+                           stream,
+                           d_run_items,
+                           d_run_offsets,
+                           d_output);
+        HIP_CHECK(hipPeekAtLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+
+        auto end = std::chrono::high_resolution_clock::now();
+        auto elapsed_seconds
+            = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
+
+        state.SetIterationTime(elapsed_seconds.count());
+    }
+    state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials);
+    state.SetItemsProcessed(state.iterations() * output_length * Trials);
+
+    HIP_CHECK(hipFree(d_run_items));
+    HIP_CHECK(hipFree(d_run_offsets));
+    HIP_CHECK(hipFree(d_output));
+}
+
+#define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT)                                 \
+    benchmark::RegisterBenchmark("block_run_length_decode<Item Type:" #IT ",Offset Type:" #OT \
+                                 ",Min RunLength:" #MINRL ",Max RunLength:" #MAXRL            \
+                                 ",BlockSize: " #BS ",Runs Per Thread:" #RPT                  \
+                                 ",Decoded Items Per Thread:" #DIPT ">",                      \
+                                 &run_benchmark<IT, OT, MINRL, MAXRL, BS, RPT, DIPT>,         \
+                                 stream,                                                      \
+                                 size)
+
+int main(int argc, char* argv[])
+{
+    cli::Parser parser(argc, argv);
+    parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
+    parser.set_optional<int>("trials", "trials", -1, "number of iterations");
+    parser.run_and_exit_if_error();
+
+    // Parse argv
+    benchmark::Initialize(&argc, argv);
+    const size_t size   = parser.get<size_t>("size");
+    const int    trials = parser.get<int>("trials");
+
+    std::cout << "benchmark_block_run_length_decode" << std::endl;
+
+    // HIP
+    hipStream_t     stream = 0; // default
+    hipDeviceProp_t devProp;
+    int             device_id = 0;
+    HIP_CHECK(hipGetDevice(&device_id));
+    HIP_CHECK(hipGetDeviceProperties(&devProp, device_id));
+    std::cout << "[HIP] Device name: " << devProp.name << std::endl;
+
+    // Add benchmarks
+    std::vector<benchmark::internal::Benchmark*> benchmarks{
+        CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4),
+        CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4),
+
+        CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4),
+        CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4)};
+
+    // Use manual timing
+    for(auto& b : benchmarks)
+    {
+        b->UseManualTime();
+        b->Unit(benchmark::kMillisecond);
+    }
+
+    // Force number of iterations
+    if(trials > 0)
+    {
+        for(auto& b : benchmarks)
+        {
+            b->Iterations(trials);
+        }
+    }
+
+    // Run benchmarks
+    benchmark::RunSpecifiedBenchmarks();
+    return 0;
+}