From 8fdcbbc158e0cbf1251f292dcf7e57c6cfb25e68 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 13 Feb 2023 15:06:10 +0800 Subject: [PATCH 1/4] Extract device algorithms. --- src/common/algorithm.cuh | 185 ++++++++++++++++++++++-- src/common/device_helpers.cuh | 155 ++++---------------- src/common/stats.cuh | 5 +- src/data/iterative_dmatrix.h | 2 +- src/data/proxy_dmatrix.h | 2 +- src/data/sparse_page_dmatrix.h | 2 +- src/metric/auc.cc | 28 ++-- src/metric/auc.cu | 44 +++--- src/metric/auc.h | 17 ++- tests/cpp/common/test_algorithm.cu | 97 +++++++++++++ tests/cpp/common/test_device_helpers.cu | 24 --- 11 files changed, 346 insertions(+), 215 deletions(-) create mode 100644 tests/cpp/common/test_algorithm.cu diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index dfce723da000..2d2b7db1173c 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -1,27 +1,184 @@ -/*! - * Copyright 2022 by XGBoost Contributors +/** + * Copyright 2022-2023 by XGBoost Contributors */ #pragma once -#include // thrust::upper_bound -#include // thrust::seq +#include // copy +#include // stable_sort_by_key +#include // DispatchSegmentedRadixSort,NullType,DoubleBuffer +#include // distance +#include // numeric_limits +#include // conditional_t,remove_const_t + +#include "cuda_context.cuh" // CUDAContext +#include "device_helpers.cuh" // TemporaryArray #include "xgboost/base.h" -#include "xgboost/span.h" +#include "xgboost/context.h" // Context +#include "xgboost/span.h" // Span,byte namespace xgboost { namespace common { -namespace cuda { -template -size_t XGBOOST_DEVICE SegmentId(It first, It last, size_t idx) { - size_t segment_id = thrust::upper_bound(thrust::seq, first, last, idx) - 1 - first; - return segment_id; +namespace detail { +// Wrapper around cub sort to define is_decending +template +static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_storage, + size_t &temp_storage_bytes, const KeyT *d_keys_in, + KeyT *d_keys_out, int num_items, int num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, int begin_bit = 0, + int end_bit = sizeof(KeyT) * 8, + bool debug_synchronous = false) { + typedef int OffsetT; + + // Null value type + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values; + + dh::safe_cuda((cub::DispatchSegmentedRadixSort< + IS_DESCENDING, KeyT, cub::NullType, BeginOffsetIteratorT, EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, + num_segments, d_begin_offsets, d_end_offsets, begin_bit, + end_bit, false, ctx->Stream(), debug_synchronous))); } -template -size_t XGBOOST_DEVICE SegmentId(Span segments_ptr, size_t idx) { - return SegmentId(segments_ptr.cbegin(), segments_ptr.cend(), idx); +// Wrapper around cub sort for easier `descending` sort. +template +void DeviceSegmentedRadixSortPair(void *d_temp_storage, size_t &temp_storage_bytes, + const KeyT *d_keys_in, // NOLINT + KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, + size_t num_items, size_t num_segments, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream, + int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { + cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + // In old version of cub, num_items in dispatch is also int32_t, no way to change. + using OffsetT = + std::conditional_t(), size_t, int32_t>; + CHECK_LE(num_items, std::numeric_limits::max()); + // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation + +#if THRUST_MAJOR_VERSION >= 2 + dh::safe_cuda((cub::DispatchSegmentedRadixSort< + descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, + num_segments, d_begin_offsets, d_end_offsets, begin_bit, + end_bit, false, stream))); +#elif (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) + dh::safe_cuda((cub::DispatchSegmentedRadixSort< + descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, + OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, + num_segments, d_begin_offsets, d_end_offsets, begin_bit, + end_bit, false, stream, false))); +#else + dh::safe_cuda( + (cub::DispatchSegmentedRadixSort::Dispatch(d_temp_storage, temp_storage_bytes, + d_keys, d_values, num_items, num_segments, + d_begin_offsets, d_end_offsets, begin_bit, + end_bit, false, stream, false))); +#endif +} +} // namespace detail + +template +void SegmentedSequence(Context const *ctx, Span d_offset_ptr, Span out_sequence) { + dh::LaunchN(out_sequence.size(), ctx->CUDACtx()->Stream(), + [out_sequence, d_offset_ptr] __device__(size_t idx) { + auto group = dh::SegmentId(d_offset_ptr, idx); + out_sequence[idx] = idx - d_offset_ptr[group]; + }); +} + +template +inline void SegmentedSortKeys(Context const *ctx, Span group_ptr, + Span out_sorted_values) { + CHECK_GE(group_ptr.size(), 1ul); + size_t n_groups = group_ptr.size() - 1; + size_t bytes = 0; + auto const *cuctx = ctx->CUDACtx(); + CHECK(cuctx); + detail::DeviceSegmentedRadixSortKeys( + cuctx, nullptr, bytes, out_sorted_values.data(), out_sorted_values.data(), + out_sorted_values.size(), n_groups, group_ptr.data(), group_ptr.data() + 1); + dh::TemporaryArray temp_storage(bytes); + detail::DeviceSegmentedRadixSortKeys( + cuctx, temp_storage.data().get(), bytes, out_sorted_values.data(), out_sorted_values.data(), + out_sorted_values.size(), n_groups, group_ptr.data(), group_ptr.data() + 1); +} + +/** + * \brief Create sorted index for data with multiple segments. + * + * \tparam accending sorted in non-decreasing order. + * \tparam per_seg_index Index starts from 0 for each segment if true, otherwise the + * the index span the whole data. + */ +template +void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr, + Span sorted_idx) { + CHECK_GE(group_ptr.size(), 1ul); + size_t n_groups = group_ptr.size() - 1; + size_t bytes = 0; + if (per_seg_index) { + SegmentedSequence(ctx, group_ptr, sorted_idx); + } else { + dh::Iota(sorted_idx); + } + dh::TemporaryArray> values_out(values.size()); + dh::TemporaryArray> sorted_idx_out(sorted_idx.size()); + + detail::DeviceSegmentedRadixSortPair( + nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(), + sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(), + group_ptr.data() + 1, ctx->CUDACtx()->Stream()); + dh::TemporaryArray temp_storage(bytes); + detail::DeviceSegmentedRadixSortPair( + temp_storage.data().get(), bytes, values.data(), values_out.data().get(), sorted_idx.data(), + sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(), + group_ptr.data() + 1, ctx->CUDACtx()->Stream()); + + dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(), + sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); +} + +/** + * \brief Different from the radix-sort-based argsort, this one can handle cases where + * segment doesn't start from 0, but as a result it uses comparison sort. + */ +template +void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin, + ValIt val_end, dh::device_vector *p_sorted_idx) { + using Tup = thrust::tuple; + auto &sorted_idx = *p_sorted_idx; + size_t n = std::distance(val_begin, val_end); + sorted_idx.resize(n); + dh::Iota(dh::ToSpan(sorted_idx)); + dh::device_vector keys(sorted_idx.size()); + auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) -> Tup { + int32_t leaf_idx; + if (i < *seg_begin) { + leaf_idx = -1; + } else { + leaf_idx = dh::SegmentId(seg_begin, seg_end, i); + } + auto residue = val_begin[i]; + return thrust::make_tuple(leaf_idx, residue); + }); + thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin()); + + dh::XGBDeviceAllocator alloc; + thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(), + [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) { + if (thrust::get<0>(l) != thrust::get<0>(r)) { + return thrust::get<0>(l) < thrust::get<0>(r); // segment index + } + return thrust::get<1>(l) < thrust::get<1>(r); // residue + }); } -} // namespace cuda } // namespace common } // namespace xgboost diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 4a225d34b890..d56965dfeee7 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1,43 +1,39 @@ -/*! - * Copyright 2017-2022 XGBoost contributors +/** + * Copyright 2017-2023 XGBoost contributors */ #pragma once +#include // thrust::upper_bound +#include #include #include -#include +#include // thrust::seq +#include // gather #include -#include +#include // make_transform_output_iterator +#include #include #include #include #include -#include - #include -#include -#include #include -#include - -#include -#include #include #include +#include +#include #include #include #include -#include #include - -#include "xgboost/logging.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/span.h" -#include "xgboost/global_config.h" +#include #include "../collective/communicator-inl.h" #include "common.h" -#include "algorithm.cuh" +#include "xgboost/global_config.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/logging.h" +#include "xgboost/span.h" #ifdef XGBOOST_USE_NCCL #include "nccl.h" @@ -1015,7 +1011,16 @@ XGBOOST_DEVICE thrust::transform_iterator MakeTransformIt return thrust::transform_iterator(iter, func); } -using xgboost::common::cuda::SegmentId; // import it for compatibility +template +size_t XGBOOST_DEVICE SegmentId(It first, It last, size_t idx) { + size_t segment_id = thrust::upper_bound(thrust::seq, first, last, idx) - 1 - first; + return segment_id; +} + +template +size_t XGBOOST_DEVICE SegmentId(xgboost::common::Span segments_ptr, size_t idx) { + return SegmentId(segments_ptr.cbegin(), segments_ptr.cend(), idx); +} namespace detail { template @@ -1288,114 +1293,6 @@ void ArgSort(xgboost::common::Span keys, xgboost::common::Span sorted_i sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); } -namespace detail { -// Wrapper around cub sort for easier `descending` sort. -template -void DeviceSegmentedRadixSortPair( - void *d_temp_storage, size_t &temp_storage_bytes, const KeyT *d_keys_in, // NOLINT - KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, - size_t num_items, size_t num_segments, BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, int begin_bit = 0, - int end_bit = sizeof(KeyT) * 8) { - cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); - cub::DoubleBuffer d_values(const_cast(d_values_in), - d_values_out); - // In old version of cub, num_items in dispatch is also int32_t, no way to change. - using OffsetT = - std::conditional_t(), size_t, - int32_t>; - CHECK_LE(num_items, std::numeric_limits::max()); - // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation - -#if THRUST_MAJOR_VERSION >= 2 - safe_cuda((cub::DispatchSegmentedRadixSort< - descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, - OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, - d_values, num_items, num_segments, - d_begin_offsets, d_end_offsets, begin_bit, - end_bit, false, nullptr))); -#elif (THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION >= 13) - safe_cuda((cub::DispatchSegmentedRadixSort< - descending, KeyT, ValueT, BeginOffsetIteratorT, EndOffsetIteratorT, - OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, - d_values, num_items, num_segments, - d_begin_offsets, d_end_offsets, begin_bit, - end_bit, false, nullptr, false))); -#else - safe_cuda((cub::DispatchSegmentedRadixSort< - descending, KeyT, ValueT, BeginOffsetIteratorT, - OffsetT>::Dispatch(d_temp_storage, temp_storage_bytes, d_keys, - d_values, num_items, num_segments, - d_begin_offsets, d_end_offsets, begin_bit, - end_bit, false, nullptr, false))); -#endif - -} -} // namespace detail - -template -void SegmentedArgSort(xgboost::common::Span values, - xgboost::common::Span group_ptr, - xgboost::common::Span sorted_idx) { - CHECK_GE(group_ptr.size(), 1ul); - size_t n_groups = group_ptr.size() - 1; - size_t bytes = 0; - Iota(sorted_idx); - TemporaryArray> values_out(values.size()); - TemporaryArray> sorted_idx_out(sorted_idx.size()); - - detail::DeviceSegmentedRadixSortPair( - nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(), - sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(), - group_ptr.data() + 1); - TemporaryArray temp_storage(bytes); - detail::DeviceSegmentedRadixSortPair( - temp_storage.data().get(), bytes, values.data(), values_out.data().get(), - sorted_idx.data(), sorted_idx_out.data().get(), sorted_idx.size(), - n_groups, group_ptr.data(), group_ptr.data() + 1); - - safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(), - sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); -} - -/** - * \brief Different from the above one, this one can handle cases where segment doesn't - * start from 0, but as a result it uses comparison sort. - */ -template -void SegmentedArgSort(SegIt seg_begin, SegIt seg_end, ValIt val_begin, ValIt val_end, - dh::device_vector *p_sorted_idx) { - using Tup = thrust::tuple; - auto &sorted_idx = *p_sorted_idx; - size_t n = std::distance(val_begin, val_end); - sorted_idx.resize(n); - dh::Iota(dh::ToSpan(sorted_idx)); - dh::device_vector keys(sorted_idx.size()); - auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(size_t i) -> Tup { - int32_t leaf_idx; - if (i < *seg_begin) { - leaf_idx = -1; - } else { - leaf_idx = dh::SegmentId(seg_begin, seg_end, i); - } - auto residue = val_begin[i]; - return thrust::make_tuple(leaf_idx, residue); - }); - dh::XGBCachingDeviceAllocator caching; - thrust::copy(thrust::cuda::par(caching), key_it, key_it + keys.size(), keys.begin()); - - dh::XGBDeviceAllocator alloc; - thrust::stable_sort_by_key(thrust::cuda::par(alloc), keys.begin(), keys.end(), sorted_idx.begin(), - [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) { - if (thrust::get<0>(l) != thrust::get<0>(r)) { - return thrust::get<0>(l) < thrust::get<0>(r); // segment index - } - return thrust::get<1>(l) < thrust::get<1>(r); // residue - }); -} - class CUDAStreamView; class CUDAEvent { @@ -1412,7 +1309,7 @@ class CUDAEvent { CUDAEvent(CUDAEvent const &that) = delete; CUDAEvent &operator=(CUDAEvent const &that) = delete; - inline void Record(CUDAStreamView stream); // NOLINT + inline void Record(CUDAStreamView stream); // NOLINT operator cudaEvent_t() const { return event_; } // NOLINT }; diff --git a/src/common/stats.cuh b/src/common/stats.cuh index b95f6866ca5c..f31233461f6d 100644 --- a/src/common/stats.cuh +++ b/src/common/stats.cuh @@ -17,6 +17,7 @@ #include // std::numeric_limits #include // std::is_floating_point,std::iterator_traits +#include "algorithm.cuh" // SegmentedArgMergeSort #include "cuda_context.cuh" // CUDAContext #include "device_helpers.cuh" #include "xgboost/context.h" // Context @@ -150,7 +151,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se ValIt val_begin, ValIt val_end, HostDeviceVector* quantiles) { dh::device_vector sorted_idx; using Tup = thrust::tuple; - dh::SegmentedArgSort(seg_begin, seg_end, val_begin, val_end, &sorted_idx); + common::SegmentedArgMergeSort(ctx, seg_begin, seg_end, val_begin, val_end, &sorted_idx); auto n_segments = std::distance(seg_begin, seg_end) - 1; if (n_segments <= 0) { return; @@ -203,7 +204,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b HostDeviceVector* quantiles) { auto cuctx = ctx->CUDACtx(); dh::device_vector sorted_idx; - dh::SegmentedArgSort(seg_beg, seg_end, val_begin, val_end, &sorted_idx); + common::SegmentedArgMergeSort(ctx, seg_beg, seg_end, val_begin, val_end, &sorted_idx); auto d_sorted_idx = dh::ToSpan(sorted_idx); std::size_t n_weights = std::distance(w_begin, w_end); dh::device_vector weights_cdf(n_weights); diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index 4df2c97531d4..28c4087c419a 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -86,7 +86,7 @@ class IterativeDMatrix : public DMatrix { LOG(FATAL) << "Slicing DMatrix is not supported for Quantile DMatrix."; return nullptr; } - DMatrix *SliceCol(int num_slices, int slice_id) override { + DMatrix *SliceCol(int, int) override { LOG(FATAL) << "Slicing DMatrix columns is not supported for Quantile DMatrix."; return nullptr; } diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index 6c8a04077f79..fa55a481f582 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -87,7 +87,7 @@ class DMatrixProxy : public DMatrix { LOG(FATAL) << "Slicing DMatrix is not supported for Proxy DMatrix."; return nullptr; } - DMatrix* SliceCol(int num_slices, int slice_id) override { + DMatrix* SliceCol(int, int) override { LOG(FATAL) << "Slicing DMatrix columns is not supported for Proxy DMatrix."; return nullptr; } diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index 5157116bf105..aa0be69845aa 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -107,7 +107,7 @@ class SparsePageDMatrix : public DMatrix { LOG(FATAL) << "Slicing DMatrix is not supported for external memory."; return nullptr; } - DMatrix *SliceCol(int num_slices, int slice_id) override { + DMatrix *SliceCol(int, int) override { LOG(FATAL) << "Slicing DMatrix columns is not supported for external memory."; return nullptr; } diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 8a2e2199e604..8248d24ba235 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -345,8 +345,8 @@ class EvalROCAUC : public EvalAUC { std::tie(auc, valid_groups) = RankingAUC(predts.ConstHostVector(), info, n_threads); } else { - std::tie(auc, valid_groups) = GPURankingAUC( - predts.ConstDeviceSpan(), info, ctx_->gpu_id, &this->d_cache_); + std::tie(auc, valid_groups) = + GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_pair(auc, valid_groups); } @@ -360,8 +360,7 @@ class EvalROCAUC : public EvalAUC { auc = MultiClassOVR(predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } else { - auc = GPUMultiClassROCAUC(predts.ConstDeviceSpan(), info, ctx_->gpu_id, - &this->d_cache_, n_classes); + auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); } return auc; } @@ -398,14 +397,15 @@ std::tuple GPUBinaryROCAUC(common::Span, Me return {}; } -double GPUMultiClassROCAUC(common::Span, MetaInfo const &, std::int32_t, +double GPUMultiClassROCAUC(Context const *, common::Span, MetaInfo const &, std::shared_ptr *, std::size_t) { common::AssertGPUSupport(); return 0.0; } -std::pair GPURankingAUC(common::Span, MetaInfo const &, - std::int32_t, std::shared_ptr *) { +std::pair GPURankingAUC(Context const *, common::Span, + MetaInfo const &, + std::shared_ptr *) { common::AssertGPUSupport(); return {}; } @@ -437,8 +437,7 @@ class EvalPRAUC : public EvalAUC { return MultiClassOVR(predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); } else { - return GPUMultiClassPRAUC(predts.ConstDeviceSpan(), info, ctx_->gpu_id, - &d_cache_, n_classes); + return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes); } } @@ -455,8 +454,8 @@ class EvalPRAUC : public EvalAUC { std::tie(auc, valid_groups) = RankingAUC(predts.ConstHostVector(), info, n_threads); } else { - std::tie(auc, valid_groups) = GPURankingPRAUC( - predts.ConstDeviceSpan(), info, ctx_->gpu_id, &d_cache_); + std::tie(auc, valid_groups) = + GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_); } return std::make_pair(auc, valid_groups); } @@ -476,14 +475,15 @@ std::tuple GPUBinaryPRAUC(common::Span, Met return {}; } -double GPUMultiClassPRAUC(common::Span, MetaInfo const &, std::int32_t, +double GPUMultiClassPRAUC(Context const *, common::Span, MetaInfo const &, std::shared_ptr *, std::size_t) { common::AssertGPUSupport(); return {}; } -std::pair GPURankingPRAUC(common::Span, MetaInfo const &, - std::int32_t, std::shared_ptr *) { +std::pair GPURankingPRAUC(Context const *, common::Span, + MetaInfo const &, + std::shared_ptr *) { common::AssertGPUSupport(); return {}; } diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 788d9a5698a5..a59c14bddf82 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -12,6 +12,7 @@ #include #include "../collective/device_communicator.cuh" +#include "../common/algorithm.cuh" // SegmentedArgSort #include "../common/optional_weight.h" // OptionalWeights #include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads #include "auc.h" @@ -436,7 +437,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span predts, +void MultiClassSortedIdx(Context const *ctx, common::Span predts, common::Span d_class_ptr, std::shared_ptr cache) { size_t n_classes = d_class_ptr.size() - 1; @@ -449,11 +450,11 @@ void MultiClassSortedIdx(common::Span predts, dh::LaunchN(n_classes + 1, [=] XGBOOST_DEVICE(size_t i) { d_class_ptr[i] = i * n_samples; }); auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - dh::SegmentedArgSort(d_predts_t, d_class_ptr, d_sorted_idx); + common::SegmentedArgSort(ctx, d_predts_t, d_class_ptr, d_sorted_idx); } -double GPUMultiClassROCAUC(common::Span predts, MetaInfo const &info, - std::int32_t device, std::shared_ptr *p_cache, +double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache, std::size_t n_classes) { auto& cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -462,13 +463,13 @@ double GPUMultiClassROCAUC(common::Span predts, MetaInfo const &inf * Create sorted index for each class */ dh::TemporaryArray class_ptr(n_classes + 1, 0); - MultiClassSortedIdx(predts, dh::ToSpan(class_ptr), cache); + MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache); auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp, size_t /*class_id*/) { return TrapezoidArea(fp_prev, fp, tp_prev, tp); }; - return GPUMultiClassAUCOVR(info, device, dh::ToSpan(class_ptr), n_classes, cache, fn); + return GPUMultiClassAUCOVR(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn); } namespace { @@ -480,8 +481,8 @@ struct RankScanItem { }; } // anonymous namespace -std::pair GPURankingAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, +std::pair GPURankingAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache) { auto& cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -509,10 +510,10 @@ std::pair GPURankingAUC(common::Span predts, /** * Sort the labels */ - auto d_labels = info.labels.View(device); + auto d_labels = info.labels.View(ctx->gpu_id); auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - dh::SegmentedArgSort(d_labels.Values(), d_group_ptr, d_sorted_idx); + common::SegmentedArgSort(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx); auto d_weights = info.weights_.ConstDeviceSpan(); @@ -640,8 +641,8 @@ std::tuple GPUBinaryPRAUC(common::Span pred return std::make_tuple(1.0, 1.0, auc); } -double GPUMultiClassPRAUC(common::Span predts, MetaInfo const &info, - std::int32_t device, std::shared_ptr *p_cache, +double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache, std::size_t n_classes) { auto& cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -651,7 +652,7 @@ double GPUMultiClassPRAUC(common::Span predts, MetaInfo const &info */ dh::TemporaryArray class_ptr(n_classes + 1, 0); auto d_class_ptr = dh::ToSpan(class_ptr); - MultiClassSortedIdx(predts, d_class_ptr, cache); + MultiClassSortedIdx(ctx, predts, d_class_ptr, cache); auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); auto d_weights = info.weights_.ConstDeviceSpan(); @@ -659,7 +660,7 @@ double GPUMultiClassPRAUC(common::Span predts, MetaInfo const &info /** * Get total positive/negative */ - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->gpu_id); auto n_samples = info.num_row_; dh::caching_device_vector totals(n_classes); auto key_it = @@ -692,7 +693,7 @@ double GPUMultiClassPRAUC(common::Span predts, MetaInfo const &info return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[class_id].first); }; - return GPUMultiClassAUCOVR(info, device, d_class_ptr, n_classes, cache, fn); + return GPUMultiClassAUCOVR(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn); } template @@ -815,10 +816,11 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, return std::make_pair(auc, n_groups - invalid_groups); } -std::pair GPURankingPRAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, +std::pair GPURankingPRAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); if (predts.empty()) { return std::make_pair(0.0, static_cast(0)); } @@ -836,10 +838,10 @@ std::pair GPURankingPRAUC(common::Span predt * Create sorted index for each group */ auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); - dh::SegmentedArgSort(predts, d_group_ptr, d_sorted_idx); + common::SegmentedArgSort(ctx, predts, d_group_ptr, d_sorted_idx); dh::XGBDeviceAllocator alloc; - auto labels = info.labels.View(device); + auto labels = info.labels.View(ctx->gpu_id); if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()), dh::tend(labels.Values()), PRAUCLabelInvalid{})) { InvalidLabels(); @@ -878,7 +880,7 @@ std::pair GPURankingPRAUC(common::Span predt return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[group_id].first); }; - return GPURankingPRAUCImpl(predts, info, d_group_ptr, device, cache, fn); + return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn); } } // namespace metric } // namespace xgboost diff --git a/src/metric/auc.h b/src/metric/auc.h index b92188068ea4..d8e7f4344cb6 100644 --- a/src/metric/auc.h +++ b/src/metric/auc.h @@ -33,12 +33,12 @@ std::tuple GPUBinaryROCAUC(common::Span pre MetaInfo const &info, std::int32_t device, std::shared_ptr *p_cache); -double GPUMultiClassROCAUC(common::Span predts, MetaInfo const &info, - std::int32_t device, std::shared_ptr *cache, +double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache, std::size_t n_classes); -std::pair GPURankingAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, +std::pair GPURankingAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *cache); /********** @@ -48,12 +48,13 @@ std::tuple GPUBinaryPRAUC(common::Span pred MetaInfo const &info, std::int32_t device, std::shared_ptr *p_cache); -double GPUMultiClassPRAUC(common::Span predts, MetaInfo const &info, - std::int32_t device, std::shared_ptr *cache, +double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, + MetaInfo const &info, std::shared_ptr *p_cache, std::size_t n_classes); -std::pair GPURankingPRAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, +std::pair GPURankingPRAUC(Context const *ctx, + common::Span predts, + MetaInfo const &info, std::shared_ptr *cache); namespace detail { diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu new file mode 100644 index 000000000000..8e5a0939867f --- /dev/null +++ b/tests/cpp/common/test_algorithm.cu @@ -0,0 +1,97 @@ +/** + * Copyright 2023 by XGBoost Contributors + */ +#include +#include // copy +#include // sequence +#include // is_sorted + +#include // is_sorted +#include // size_t + +#include "../../../src/common/algorithm.cuh" +#include "../../../src/common/device_helpers.cuh" +#include "../helpers.h" // CreateEmptyGenericParam + +namespace xgboost { +namespace common { +void TestSegmentedArgSort() { + Context ctx; + ctx.gpu_id = 0; + + size_t constexpr kElements = 100, kGroups = 3; + dh::device_vector sorted_idx(kElements, 0); + dh::device_vector offset_ptr(kGroups + 1, 0); + offset_ptr[0] = 0; + offset_ptr[1] = 2; + offset_ptr[2] = 78; + offset_ptr[kGroups] = kElements; + auto d_offset_ptr = dh::ToSpan(offset_ptr); + + auto d_sorted_idx = dh::ToSpan(sorted_idx); + dh::LaunchN(sorted_idx.size(), [=] XGBOOST_DEVICE(size_t idx) { + auto group = dh::SegmentId(d_offset_ptr, idx); + d_sorted_idx[idx] = idx - d_offset_ptr[group]; + }); + + dh::device_vector values(kElements, 0.0f); + thrust::sequence(values.begin(), values.end(), 0.0f); + SegmentedArgSort(&ctx, dh::ToSpan(values), d_offset_ptr, d_sorted_idx); + + std::vector h_sorted_index(sorted_idx.size()); + thrust::copy(sorted_idx.begin(), sorted_idx.end(), h_sorted_index.begin()); + + for (size_t i = 1; i < kGroups + 1; ++i) { + auto group_idx = common::Span(h_sorted_index) + .subspan(offset_ptr[i - 1], offset_ptr[i] - offset_ptr[i - 1]); + ASSERT_TRUE(std::is_sorted(group_idx.begin(), group_idx.end(), std::greater<>{})); + ASSERT_EQ(group_idx.back(), 0); + for (auto j : group_idx) { + ASSERT_LT(j, group_idx.size()); + } + } +} + +TEST(Algorithms, SegmentedArgSort) { TestSegmentedArgSort(); } + +TEST(Algorithms, ArgSort) { + Context ctx; + ctx.gpu_id = 0; + + dh::device_vector values(20); + dh::Iota(dh::ToSpan(values)); // accending + dh::device_vector sorted_idx(20); + dh::ArgSort(dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending + ASSERT_TRUE(thrust::is_sorted(thrust::device, sorted_idx.begin(), sorted_idx.end(), + thrust::greater{})); + + dh::Iota(dh::ToSpan(values)); + dh::device_vector groups(3); + groups[0] = 0; + groups[1] = 10; + groups[2] = 20; + SegmentedArgSort(&ctx, dh::ToSpan(values), dh::ToSpan(groups), + dh::ToSpan(sorted_idx)); + ASSERT_FALSE(thrust::is_sorted(thrust::device, sorted_idx.begin(), sorted_idx.end(), + thrust::greater{})); + ASSERT_TRUE( + thrust::is_sorted(sorted_idx.begin(), sorted_idx.begin() + 10, thrust::greater{})); + ASSERT_TRUE( + thrust::is_sorted(sorted_idx.begin() + 10, sorted_idx.end(), thrust::greater{})); +} + +TEST(Algorithms, SegmentedSequence) { + dh::device_vector idx(16); + dh::device_vector ptr(3); + Context ctx = CreateEmptyGenericParam(0); + ptr[0] = 0; + ptr[1] = 4; + ptr[2] = idx.size(); + SegmentedSequence(&ctx, dh::ToSpan(ptr), dh::ToSpan(idx)); + ASSERT_EQ(idx[0], 0); + ASSERT_EQ(idx[4], 0); + ASSERT_EQ(idx[3], 3); + ASSERT_EQ(idx[15], 11); +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 1e943a3a097b..7ae8faf03030 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -172,28 +172,4 @@ TEST(Allocator, OOM) { // Clear last error so we don't fail subsequent tests cudaGetLastError(); } - -TEST(DeviceHelpers, ArgSort) { - dh::device_vector values(20); - dh::Iota(dh::ToSpan(values)); // accending - dh::device_vector sorted_idx(20); - dh::ArgSort(dh::ToSpan(values), dh::ToSpan(sorted_idx)); // sort to descending - ASSERT_TRUE(thrust::is_sorted(thrust::device, sorted_idx.begin(), - sorted_idx.end(), thrust::greater{})); - - dh::Iota(dh::ToSpan(values)); - dh::device_vector groups(3); - groups[0] = 0; - groups[1] = 10; - groups[2] = 20; - dh::SegmentedArgSort(dh::ToSpan(values), dh::ToSpan(groups), - dh::ToSpan(sorted_idx)); - ASSERT_FALSE(thrust::is_sorted(thrust::device, sorted_idx.begin(), - sorted_idx.end(), thrust::greater{})); - ASSERT_TRUE(thrust::is_sorted(sorted_idx.begin(), sorted_idx.begin() + 10, - thrust::greater{})); - ASSERT_TRUE(thrust::is_sorted(sorted_idx.begin() + 10, sorted_idx.end(), - thrust::greater{})); -} - } // namespace xgboost From b40c7d92ed213dd6469b1a3c09139f30a8f73efe Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 13 Feb 2023 16:11:11 +0800 Subject: [PATCH 2/4] tidy. --- src/common/algorithm.cuh | 54 ++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index 2d2b7db1173c..10967cf2fe43 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -1,20 +1,26 @@ /** * Copyright 2022-2023 by XGBoost Contributors */ -#pragma once +#ifndef XGBOOST_COMMON_ALGORITHM_CUH_ +#define XGBOOST_COMMON_ALGORITHM_CUH_ #include // copy #include // stable_sort_by_key +#include // tuple,get +#include // size_t +#include // int32_t #include // DispatchSegmentedRadixSort,NullType,DoubleBuffer #include // distance #include // numeric_limits #include // conditional_t,remove_const_t +#include "common.h" // safe_cuda #include "cuda_context.cuh" // CUDAContext -#include "device_helpers.cuh" // TemporaryArray -#include "xgboost/base.h" +#include "device_helpers.cuh" // TemporaryArray,SegmentId,LaunchN,Iota,device_vector +#include "xgboost/base.h" // XGBOOST_DEVICE #include "xgboost/context.h" // Context +#include "xgboost/logging.h" // CHECK #include "xgboost/span.h" // Span,byte namespace xgboost { @@ -24,13 +30,13 @@ namespace detail { template static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_storage, - size_t &temp_storage_bytes, const KeyT *d_keys_in, - KeyT *d_keys_out, int num_items, int num_segments, - BeginOffsetIteratorT d_begin_offsets, + std::size_t &temp_storage_bytes, // NOLINT + const KeyT *d_keys_in, KeyT *d_keys_out, int num_items, + int num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8, bool debug_synchronous = false) { - typedef int OffsetT; + using OffsetT = int; // Null value type cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); @@ -46,18 +52,19 @@ static void DeviceSegmentedRadixSortKeys(CUDAContext const *ctx, void *d_temp_st // Wrapper around cub sort for easier `descending` sort. template -void DeviceSegmentedRadixSortPair(void *d_temp_storage, size_t &temp_storage_bytes, - const KeyT *d_keys_in, // NOLINT - KeyT *d_keys_out, const ValueT *d_values_in, ValueT *d_values_out, - size_t num_items, size_t num_segments, +void DeviceSegmentedRadixSortPair(void *d_temp_storage, + std::size_t &temp_storage_bytes, // NOLINT + const KeyT *d_keys_in, KeyT *d_keys_out, + const ValueT *d_values_in, ValueT *d_values_out, + std::size_t num_items, std::size_t num_segments, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, dh::CUDAStreamView stream, int begin_bit = 0, int end_bit = sizeof(KeyT) * 8) { cub::DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); cub::DoubleBuffer d_values(const_cast(d_values_in), d_values_out); // In old version of cub, num_items in dispatch is also int32_t, no way to change. - using OffsetT = - std::conditional_t(), size_t, int32_t>; + using OffsetT = std::conditional_t(), + std::size_t, std::int32_t>; CHECK_LE(num_items, std::numeric_limits::max()); // For Thrust >= 1.12 or CUDA >= 11.4, we require system cub installation @@ -87,7 +94,7 @@ void DeviceSegmentedRadixSortPair(void *d_temp_storage, size_t &temp_storage_byt template void SegmentedSequence(Context const *ctx, Span d_offset_ptr, Span out_sequence) { dh::LaunchN(out_sequence.size(), ctx->CUDACtx()->Stream(), - [out_sequence, d_offset_ptr] __device__(size_t idx) { + [out_sequence, d_offset_ptr] __device__(std::size_t idx) { auto group = dh::SegmentId(d_offset_ptr, idx); out_sequence[idx] = idx - d_offset_ptr[group]; }); @@ -97,8 +104,8 @@ template inline void SegmentedSortKeys(Context const *ctx, Span group_ptr, Span out_sorted_values) { CHECK_GE(group_ptr.size(), 1ul); - size_t n_groups = group_ptr.size() - 1; - size_t bytes = 0; + std::size_t n_groups = group_ptr.size() - 1; + std::size_t bytes = 0; auto const *cuctx = ctx->CUDACtx(); CHECK(cuctx); detail::DeviceSegmentedRadixSortKeys( @@ -121,8 +128,8 @@ template values, Span group_ptr, Span sorted_idx) { CHECK_GE(group_ptr.size(), 1ul); - size_t n_groups = group_ptr.size() - 1; - size_t bytes = 0; + std::size_t n_groups = group_ptr.size() - 1; + std::size_t bytes = 0; if (per_seg_index) { SegmentedSequence(ctx, group_ptr, sorted_idx); } else { @@ -151,15 +158,15 @@ void SegmentedArgSort(Context const *ctx, Span values, Span group_ptr, */ template void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin, - ValIt val_end, dh::device_vector *p_sorted_idx) { - using Tup = thrust::tuple; + ValIt val_end, dh::device_vector *p_sorted_idx) { + using Tup = thrust::tuple; auto &sorted_idx = *p_sorted_idx; - size_t n = std::distance(val_begin, val_end); + std::size_t n = std::distance(val_begin, val_end); sorted_idx.resize(n); dh::Iota(dh::ToSpan(sorted_idx)); dh::device_vector keys(sorted_idx.size()); auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(size_t i) -> Tup { + [=] XGBOOST_DEVICE(std::size_t i) -> Tup { int32_t leaf_idx; if (i < *seg_begin) { leaf_idx = -1; @@ -170,8 +177,6 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V return thrust::make_tuple(leaf_idx, residue); }); thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin()); - - dh::XGBDeviceAllocator alloc; thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(), [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) { if (thrust::get<0>(l) != thrust::get<0>(r)) { @@ -182,3 +187,4 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V } } // namespace common } // namespace xgboost +#endif // XGBOOST_COMMON_ALGORITHM_CUH_ From 793d54e8d61c4220d176dafd5876d846274c69bd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 13 Feb 2023 16:14:52 +0800 Subject: [PATCH 3/4] rename. --- src/common/algorithm.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh index 10967cf2fe43..53acc65e16e2 100644 --- a/src/common/algorithm.cuh +++ b/src/common/algorithm.cuh @@ -167,14 +167,14 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V dh::device_vector keys(sorted_idx.size()); auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> Tup { - int32_t leaf_idx; + std::int32_t seg_idx; if (i < *seg_begin) { - leaf_idx = -1; + seg_idx = -1; } else { - leaf_idx = dh::SegmentId(seg_begin, seg_end, i); + seg_idx = dh::SegmentId(seg_begin, seg_end, i); } auto residue = val_begin[i]; - return thrust::make_tuple(leaf_idx, residue); + return thrust::make_tuple(seg_idx, residue); }); thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin()); thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(), From 79d4ce3258da3d599c86048ba440fc1ec43ff463 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 13 Feb 2023 17:22:53 +0800 Subject: [PATCH 4/4] fixes. --- src/metric/auc.cu | 3 +++ src/metric/metric.cc | 1 + tests/cpp/common/test_algorithm.cu | 14 +++++++------- tests/cpp/common/test_quantile.cu | 4 +++- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/metric/auc.cu b/src/metric/auc.cu index a59c14bddf82..ae5ba676ec51 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -21,6 +21,9 @@ namespace xgboost { namespace metric { +// tag the this file, used by force static link later. +DMLC_REGISTRY_FILE_TAG(auc_gpu); + namespace { // Pair of FP/TP using Pair = thrust::pair; diff --git a/src/metric/metric.cc b/src/metric/metric.cc index 08183cc38e1c..2638c56edecd 100644 --- a/src/metric/metric.cc +++ b/src/metric/metric.cc @@ -84,6 +84,7 @@ DMLC_REGISTRY_LINK_TAG(multiclass_metric); DMLC_REGISTRY_LINK_TAG(survival_metric); DMLC_REGISTRY_LINK_TAG(rank_metric); #ifdef XGBOOST_USE_CUDA +DMLC_REGISTRY_LINK_TAG(auc_gpu); DMLC_REGISTRY_LINK_TAG(rank_metric_gpu); #endif } // namespace metric diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu index 8e5a0939867f..c2e159dc401d 100644 --- a/tests/cpp/common/test_algorithm.cu +++ b/tests/cpp/common/test_algorithm.cu @@ -36,18 +36,18 @@ void TestSegmentedArgSort() { dh::device_vector values(kElements, 0.0f); thrust::sequence(values.begin(), values.end(), 0.0f); - SegmentedArgSort(&ctx, dh::ToSpan(values), d_offset_ptr, d_sorted_idx); + SegmentedArgSort(&ctx, dh::ToSpan(values), d_offset_ptr, d_sorted_idx); std::vector h_sorted_index(sorted_idx.size()); thrust::copy(sorted_idx.begin(), sorted_idx.end(), h_sorted_index.begin()); for (size_t i = 1; i < kGroups + 1; ++i) { - auto group_idx = common::Span(h_sorted_index) - .subspan(offset_ptr[i - 1], offset_ptr[i] - offset_ptr[i - 1]); - ASSERT_TRUE(std::is_sorted(group_idx.begin(), group_idx.end(), std::greater<>{})); - ASSERT_EQ(group_idx.back(), 0); - for (auto j : group_idx) { - ASSERT_LT(j, group_idx.size()); + auto group_sorted_idx = common::Span(h_sorted_index) + .subspan(offset_ptr[i - 1], offset_ptr[i] - offset_ptr[i - 1]); + ASSERT_TRUE(std::is_sorted(group_sorted_idx.begin(), group_sorted_idx.end(), std::greater<>{})); + ASSERT_EQ(group_sorted_idx.back(), 0); + for (auto j : group_sorted_idx) { + ASSERT_LT(j, group_sorted_idx.size()); } } } diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index cb24f8bb4140..f36334bcc794 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -341,6 +341,7 @@ TEST(GPUQuantile, MultiMerge) { namespace { void TestAllReduceBasic(int32_t n_gpus) { auto const world = collective::GetWorldSize(); + CHECK_EQ(world, n_gpus); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { auto const device = collective::GetRank(); @@ -425,8 +426,9 @@ TEST(GPUQuantile, MGPUAllReduceBasic) { } namespace { -void TestSameOnAllWorkers(int32_t n_gpus) { +void TestSameOnAllWorkers(std::int32_t n_gpus) { auto world = collective::GetWorldSize(); + CHECK_EQ(world, n_gpus); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {