From 66ead8068387a884e0409f0d58c4ef955287f805 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 22 Oct 2024 12:16:18 -0700
Subject: [PATCH] Add condition to ensure value type is trivially copyable to
 call reduce-then-scan

Signed-off-by: Matthew Michel
---
 .../hetero/algorithm_ranges_impl_hetero.h     | 28 ++++++++++---------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
index 7179e1f864a..fc04e0296ff 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
@@ -944,26 +944,28 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
         return 1;
     }
 
+    using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>;
+    using __key_type = oneapi::dpl::__internal::__value_t<_Range1>;
+    using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
 #if _ONEDPL_BACKEND_SYCL
     // We would normally dispatch to the parallel implementation which would make the decision to invoke
     // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose
     // whether or not to use reduce-then-scan here.
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if constexpr (std::is_trivially_copyable_v<__val_type>)
     {
-        auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
-            _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys),
-            std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
-            __binary_pred, __binary_op);
-        __res.wait();
-        // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
-        // past-the-end iterator pair of segmented reduction.
-        return std::get<0>(__res.get()) + 1;
+        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+        {
+            auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
+                _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys),
+                std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
+                __binary_pred, __binary_op);
+            __res.wait();
+            // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
+            // past-the-end iterator pair of segmented reduction.
+            return std::get<0>(__res.get()) + 1;
+        }
     }
 #endif
-    using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>;
-    using __key_type = oneapi::dpl::__internal::__value_t<_Range1>;
-    using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
-
     // Round 1: reduce with extra indices added to avoid long segments
     // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence.
     //       Skip a round of copy_if and reduces if there are none.
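
Note (illustration, not part of the patch): the new guard dispatches at compile time on whether the value type is trivially copyable before attempting the reduce-then-scan path; otherwise the code falls through to the generic fallback below. A minimal standalone sketch of that dispatch pattern follows, using a hypothetical dispatch_reduce_by_segment helper rather than the real oneDPL internals:

    // Sketch only: compile-time selection between a "fast" path that requires a
    // trivially copyable value type and a generic fallback for everything else.
    #include <iostream>
    #include <string>
    #include <type_traits>
    #include <vector>

    template <typename T>
    void dispatch_reduce_by_segment(const std::vector<T>& values)
    {
        // Mirrors the patch: only the trivially copyable case may take the
        // reduce-then-scan style path; the branch is resolved at compile time.
        if constexpr (std::is_trivially_copyable_v<T>)
            std::cout << values.size() << " values: fast path (trivially copyable)\n";
        else
            std::cout << values.size() << " values: fallback path\n";
    }

    int main()
    {
        dispatch_reduce_by_segment(std::vector<int>{1, 2, 3});          // fast path
        dispatch_reduce_by_segment(std::vector<std::string>{"a", "b"}); // fallback path
    }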