Skip to content

Commit

Permalink
Merge branch 'partial-sort' into 'develop_stream'
Browse files Browse the repository at this point in the history
partial_sort and partial_sort_copy

See merge request amd/libraries/rocPRIM!684
  • Loading branch information
NB4444 authored and Naraenda committed Jul 18, 2024
2 parents 4a26848 + 88cb742 commit 10c6175
Show file tree
Hide file tree
Showing 17 changed files with 1,791 additions and 89 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Documentation for rocPRIM is available at
* Added large segment support for `rocprim:segmented_reduce`.
* Added a parallel `nth_element` device function similar to `std::nth_element`. This function rearranges the input so that elements smaller than the n-th element are placed before it and elements bigger than the n-th element are placed after it.
* Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance.
* Added parallel `partial_sort` and `partial_sort_copy` device functions, similar to `std::partial_sort` and `std::partial_sort_copy`. These functions rearrange elements such that the elements up to and including the middle index are the same as they would be in a fully sorted list.

### Changes

Expand Down
2 changes: 2 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ add_rocprim_benchmark(benchmark_device_merge_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp)
add_rocprim_benchmark(benchmark_device_nth_element.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort_copy.cpp)
add_rocprim_benchmark(benchmark_device_partition.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_block_sort.cpp)
Expand Down
123 changes: 123 additions & 0 deletions benchmark/benchmark_device_partial_sort.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "benchmark_device_partial_sort.hpp"
#include "benchmark_utils.hpp"

// CmdParser
#include "cmdparser.hpp"

// Google Benchmark
#include <benchmark/benchmark.h>

// HIP API
#include <hip/hip_runtime.h>

#include <cstddef>
#include <string>

// Default number of input values; main() uses it as the default of the "size" option.
// The definition is skipped when a DEFAULT_N macro already exists (e.g. one
// supplied on the compiler command line), in which case that macro is used instead.
#ifndef DEFAULT_N
const size_t DEFAULT_N = 1024 * 1024 * 32;
#endif

// Registers one partial_sort benchmark for key type TYPE.
// SMALL_N is forwarded to the device_partial_sort_benchmark<TYPE> constructor.
// The expansion is a braced scope (no trailing semicolon needed) and relies on the
// enclosing scope providing `benchmarks`, `size`, `seed` and `stream`, which are
// handed to REGISTER_BENCHMARK (not defined in this file; presumably provided by
// benchmark_utils.hpp - verify).
#define CREATE_BENCHMARK_PARTIAL_SORT(TYPE, SMALL_N) \
{ \
const device_partial_sort_benchmark<TYPE> instance(SMALL_N); \
REGISTER_BENCHMARK(benchmarks, size, seed, stream, instance); \
}

// Registers both partial_sort benchmark variants for key type TYPE:
// first with SMALL_N = true, then with SMALL_N = false.
// TYPE is pasted into a template argument list, so a type containing a top-level
// comma must be passed through an alias (as main() does for the custom types).
#define CREATE_BENCHMARK(TYPE) \
{ \
CREATE_BENCHMARK_PARTIAL_SORT(TYPE, true) \
CREATE_BENCHMARK_PARTIAL_SORT(TYPE, false) \
}

int main(int argc, char* argv[])
{
cli::Parser parser(argc, argv);
parser.set_optional<size_t>("size", "size", DEFAULT_N, "number of values");
parser.set_optional<int>("trials", "trials", -1, "number of iterations");
parser.set_optional<std::string>("name_format",
"name_format",
"human",
"either: json,human,txt");
parser.set_optional<std::string>("seed", "seed", "random", get_seed_message());
parser.run_and_exit_if_error();

// Parse argv
benchmark::Initialize(&argc, argv);
const size_t size = parser.get<size_t>("size");
const int trials = parser.get<int>("trials");
bench_naming::set_format(parser.get<std::string>("name_format"));
const std::string seed_type = parser.get<std::string>("seed");
const managed_seed seed(seed_type);

// HIP
hipStream_t stream = 0; // default

// Benchmark info
add_common_benchmark_info();
benchmark::AddCustomContext("size", std::to_string(size));
benchmark::AddCustomContext("seed", seed_type);

// Add benchmarks
std::vector<benchmark::internal::Benchmark*> benchmarks{};
CREATE_BENCHMARK(int)
CREATE_BENCHMARK(long long)
CREATE_BENCHMARK(int8_t)
CREATE_BENCHMARK(uint8_t)
CREATE_BENCHMARK(rocprim::half)
CREATE_BENCHMARK(short)
CREATE_BENCHMARK(float)

using custom_float2 = custom_type<float, float>;
using custom_double2 = custom_type<double, double>;
using custom_int2 = custom_type<int, int>;
using custom_char_double = custom_type<char, double>;
using custom_longlong_double = custom_type<long long, double>;

CREATE_BENCHMARK(custom_float2)
CREATE_BENCHMARK(custom_double2)
CREATE_BENCHMARK(custom_int2)
CREATE_BENCHMARK(custom_char_double)
CREATE_BENCHMARK(custom_longlong_double)

// Use manual timing
for(auto& b : benchmarks)
{
b->UseManualTime();
b->Unit(benchmark::kMillisecond);
}

// Force number of iterations
if(trials > 0)
{
for(auto& b : benchmarks)
{
b->Iterations(trials);
}
}

// Run benchmarks
benchmark::RunSpecifiedBenchmarks();
return 0;
}
184 changes: 184 additions & 0 deletions benchmark/benchmark_device_partial_sort.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
// MIT License
//
// Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_

#include "benchmark_utils.hpp"

// Google Benchmark
#include <benchmark/benchmark.h>

// HIP API
#include <hip/hip_runtime.h>

// rocPRIM
#include <rocprim/device/device_partial_sort.hpp>

#include <algorithm>
#include <cstddef>
#include <limits>
#include <string>
#include <type_traits>
#include <vector>

template<typename Key = int, typename Config = rocprim::default_config>
struct device_partial_sort_benchmark : public config_autotune_interface
{
bool small_n = false;

device_partial_sort_benchmark(bool SmallN)
{
small_n = SmallN;
}

std::string name() const override
{
using namespace std::string_literals;
return bench_naming::format_name(
"{lvl:device,algo:partial_sort,nth:" + (small_n ? "small"s : "half"s)
+ ",key_type:" + std::string(Traits<Key>::name()) + ",cfg:default_config}");
}

static constexpr unsigned int batch_size = 10;
static constexpr unsigned int warmup_size = 5;

void run(benchmark::State& state,
size_t size,
const managed_seed& seed,
hipStream_t stream) const override
{
using key_type = Key;

size_t middle = 10;

if(!small_n)
{
middle = size / 2;
}

// Generate data
std::vector<key_type> keys_input;
if(std::is_floating_point<key_type>::value)
{
keys_input = get_random_data<key_type>(size,
static_cast<key_type>(-1000),
static_cast<key_type>(1000),
seed.get_0());
}
else
{
keys_input = get_random_data<key_type>(size,
std::numeric_limits<key_type>::min(),
std::numeric_limits<key_type>::max(),
seed.get_0());
}

key_type* d_keys_input;
key_type* d_keys_new_data;
HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(*d_keys_input)));
HIP_CHECK(hipMalloc(&d_keys_new_data, size * sizeof(*d_keys_new_data)));

HIP_CHECK(hipMemcpy(d_keys_new_data,
keys_input.data(),
size * sizeof(*d_keys_input),
hipMemcpyHostToDevice));

rocprim::less<key_type> lesser_op;
void* d_temporary_storage = nullptr;
size_t temporary_storage_bytes = 0;
HIP_CHECK(rocprim::partial_sort(d_temporary_storage,
temporary_storage_bytes,
d_keys_input,
middle,
size,
lesser_op,
stream,
false));

HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));

// Warm-up
for(size_t i = 0; i < warmup_size; i++)
{
HIP_CHECK(hipMemcpy(d_keys_input,
d_keys_new_data,
size * sizeof(*d_keys_input),
hipMemcpyDeviceToDevice));
HIP_CHECK(rocprim::partial_sort(d_temporary_storage,
temporary_storage_bytes,
d_keys_input,
middle,
size,
lesser_op,
stream,
false));
}
HIP_CHECK(hipDeviceSynchronize());

// HIP events creation
hipEvent_t start, stop;
HIP_CHECK(hipEventCreate(&start));
HIP_CHECK(hipEventCreate(&stop));

for(auto _ : state)
{
float elapsed_mseconds = 0;
for(size_t i = 0; i < batch_size; i++)
{
HIP_CHECK(hipMemcpy(d_keys_input,
d_keys_new_data,
size * sizeof(*d_keys_input),
hipMemcpyDeviceToDevice));
// Record start event
HIP_CHECK(hipEventRecord(start, stream));
HIP_CHECK(rocprim::partial_sort(d_temporary_storage,
temporary_storage_bytes,
d_keys_input,
middle,
size,
lesser_op,
stream,
false));
// Record stop event and wait until it completes
HIP_CHECK(hipEventRecord(stop, stream));
HIP_CHECK(hipEventSynchronize(stop));
float elapsed_mseconds_current;
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds_current, start, stop));
elapsed_mseconds += elapsed_mseconds_current;
}

state.SetIterationTime(elapsed_mseconds / 1000);
}

// Destroy HIP events
HIP_CHECK(hipEventDestroy(start));
HIP_CHECK(hipEventDestroy(stop));

state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(*d_keys_input));
state.SetItemsProcessed(state.iterations() * batch_size * size);

HIP_CHECK(hipFree(d_temporary_storage));
HIP_CHECK(hipFree(d_keys_input));
HIP_CHECK(hipFree(d_keys_new_data));
}
};

#endif // ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_
Loading

0 comments on commit 10c6175

Please sign in to comment.