From 217d0419e1365b9e8eac14ca08768238a9d087ba Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 27 Sep 2024 16:24:34 -0500 Subject: [PATCH 01/34] Initial commit of reduce_by_segment with the reduce-then-scan path Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 6 +- .../hetero/algorithm_ranges_impl_hetero.h | 7 ++ .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 60 +++++++++++--- .../parallel_backend_sycl_reduce_then_scan.h | 81 ++++++++++--------- 4 files changed, 103 insertions(+), 51 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index d0979554786..a22f2970b49 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,18 +191,19 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator, typename _KnownIdentity> oneapi::dpl::__internal::__difference_t<_Range3> __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::false_type /* has_known_identity */) + _KnownIdentity) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } +#if 0 template oneapi::dpl::__internal::__difference_t<_Range3> @@ -573,6 +574,7 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy return __end_idx.get_host_access()[0] + 1; } +#endif template diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index b9cd154a044..f845c209cfe 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,6 +913,12 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op) + .wait(); + return 1; + #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
// @@ -1043,6 +1049,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& .__deferrable_wait(); return __result_end; + #endif } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 68dd00188dd..5e455b607fc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -775,9 +775,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -790,9 +790,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -806,9 +806,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(_InRng&& __in_rng, std::size_t __id) const + operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -1014,9 +1014,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); } } @@ -1188,6 +1188,48 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t } } +template +auto +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + auto __n = __keys.size(); + auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + using _ValueType = oneapi::dpl::__internal::__value_t; + if (__idx == 0) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + }; + auto __reduce_op = [=](const auto& __lhs_tup, 
const auto& __rhs_tup) { + if (std::get<0>(__rhs_tup) == 0) + { + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + } + return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), + std::get<1>(__rhs_tup)); + }; + auto __gen_scan_input = __gen_reduce_input; + auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; + auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { + // Will be present in L1 cache + if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + { + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + }; + using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); +} + template auto __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8c0762f2a38..d5b44d1abcc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename... _Rngs> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, + _LazyValueType& __sub_group_carry, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups) + std::uint32_t __active_subgroups, _Rngs&&... 
__rngs) { - using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; + using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); + __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? 
__start_id : __n - 1; - _GenInputType __v = __gen_input(__in_rng, __local_id); + _GenInputType __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } } else { - _GenInputType __v = __gen_input(__in_rng, __start_id); + _GenInputType __v = __gen_input(__start_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __start_id, __v); + __write_op(__start_id, __v, __rngs...); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__out_rng, __local_id, __v); + __write_op(__local_id, __v, __rngs...); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__in_rng, __local_id); + __v = __gen_input(__local_id, __rngs...); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__out_rng, __offset, __v); + __write_op(__offset, __v, __rngs...); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... 
__rngs) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, _Rngs&&... __rngs) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - __write_op.__assign(__in_rng[0], __out_rng[0]); + //__write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups); + __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups, __rngs...); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,15 +746,16 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename _IsUniquePattern, typename... 
_Rngs> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, + _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern) + _Inclusive, _IsUniquePattern, + std::size_t __n, _Rngs&&... __rngs) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -780,7 +781,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - const std::size_t __n = __in_rng.size(); + //const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -848,11 +849,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From 7a64f8fe1f06fb620c13cfbcafd820cabca95e37 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 14 Oct 2024 13:36:40 -0700 Subject: [PATCH 02/34] Revert change to ranges and use zip_view over segments / values instead Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 47 ++++++----- .../parallel_backend_sycl_reduce_then_scan.h | 81 +++++++++---------- 2 files changed, 67 insertions(+), 61 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 5e455b607fc..57d42747135 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -775,9 +775,9 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, std::size_t __n, std::size_ template struct __gen_transform_input { - template + template auto - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { // We explicitly convert __in_rng[__id] to the value type of _InRng to properly handle the case where we // process zip_iterator input where the reference type is a tuple of a references. This prevents the caller @@ -790,9 +790,9 @@ struct __gen_transform_input struct __simple_write_to_id { - template + template void - operator()(std::size_t __id, const _ValueType& __v, _InRng&, _OutRng& __out_rng) const + operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -806,9 +806,9 @@ struct __simple_write_to_id template struct __gen_mask { - template + template bool - operator()(std::size_t __id, const _InRng& __in_rng, _OutRng&) const + operator()(const _InRng& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } @@ -939,9 +939,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
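Note: the hunks in this file restore the original (range, index) calling convention for the generator and write callables; instead of threading a variadic pack of ranges through the reduce-then-scan kernels, reduce_by_segment now zips its keys and values into a single input range (see the __parallel_reduce_by_segment hunk further down). As a rough standalone illustration of why that suffices (the type and helper below are hypothetical stand-ins, not oneDPL's actual zip machinery), one zipped element carries both the key and the value, so the single-range interface needs no extension:

    #include <cstddef>
    #include <tuple>
    #include <utility>
    #include <vector>

    // Hypothetical stand-in for a zipped view over a key sequence and a
    // value sequence: indexing yields a tuple of references into both.
    struct key_value_view
    {
        const std::vector<int>& keys;
        const std::vector<int>& vals;
        auto operator[](std::size_t i) const { return std::tie(keys[i], vals[i]); }
        std::size_t size() const { return keys.size(); }
    };

    // A generator in the restored (range, index) style: produce the
    // (new-segment flag, value) pair for element i from the zipped range
    // alone. The predicate argument order here is illustrative only.
    template <typename ZipRng, typename Pred>
    std::pair<std::size_t, int>
    gen_reduce_input(const ZipRng& rng, std::size_t i, Pred pred)
    {
        const auto& [key, val] = rng[i];
        std::size_t flag = (i != 0 && !pred(std::get<0>(rng[i - 1]), key)) ? 1 : 0;
        return {flag, val};
    }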
@@ -957,9 +957,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, @@ -1014,9 +1014,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, + _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); } } @@ -1187,7 +1187,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1196,7 +1195,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { auto __n = __keys.size(); - auto __gen_reduce_input = [=](std::size_t __idx, const auto& __in_keys, const auto& __in_vals, const auto&, const auto&) { + auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; if (__idx == 0) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); @@ -1215,8 +1216,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](std::size_t __idx, const auto& __tup, const auto& __in_keys, const auto&, auto& __out_keys, auto& __out_values) { - // Will be present in L1 cache + auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __out_keys = std::get<0>(__out_rng.tuple()); + auto&& __out_vals = std::get<1>(__out_rng.tuple()); + // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; @@ -1225,9 +1229,12 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac }; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}, __n, - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + 
oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} + ); } template diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index d5b44d1abcc..09c0d754bd7 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -153,27 +153,27 @@ __sub_group_scan_partial(const __dpl_sycl::__sub_group& __sub_group, _ValueType& template + typename _WriteOp, typename _LazyValueType, typename _InRng, typename _OutRng> void __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenInput __gen_input, _ScanInputTransform __scan_input_transform, _BinaryOp __binary_op, _WriteOp __write_op, - _LazyValueType& __sub_group_carry, + _LazyValueType& __sub_group_carry, const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __start_id, std::size_t __n, std::uint32_t __iters_per_item, std::size_t __subgroup_start_id, std::uint32_t __sub_group_id, - std::uint32_t __active_subgroups, _Rngs&&... __rngs) + std::uint32_t __active_subgroups) { - using _GenInputType = std::invoke_result_t<_GenInput, std::size_t, _Rngs...>; + using _GenInputType = std::invoke_result_t<_GenInput, _InRng, std::size_t>; bool __is_full_block = (__iters_per_item == __max_inputs_per_item); bool __is_full_thread = __subgroup_start_id + __iters_per_item * __sub_group_size <= __n; if (__is_full_thread) { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>(__sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } if (__is_full_block) @@ -182,12 +182,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI _ONEDPL_PRAGMA_UNROLL for (std::uint32_t __j = 1; __j < __max_inputs_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -197,12 +197,12 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI // can proceed without special casing for partial subgroups. 
for (std::uint32_t __j = 1; __j < __iters_per_item; __j++) { - __v = __gen_input(__start_id + __j * __sub_group_size, __rngs...); + __v = __gen_input(__in_rng, __start_id + __j * __sub_group_size); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id + __j * __sub_group_size, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -218,48 +218,48 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if (__iters == 1) { std::size_t __local_id = (__start_id < __n) ? __start_id : __n - 1; - _GenInputType __v = __gen_input(__local_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - __subgroup_start_id); if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } } else { - _GenInputType __v = __gen_input(__start_id, __rngs...); + _GenInputType __v = __gen_input(__in_rng, __start_id); __sub_group_scan<__sub_group_size, __is_inclusive, __init_present>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__start_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) { std::size_t __local_id = __start_id + __j * __sub_group_size; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__local_id, __v, __rngs...); + __write_op(__in_rng, __out_rng, __local_id, __v); } } std::size_t __offset = __start_id + (__iters - 1) * __sub_group_size; std::size_t __local_id = (__offset < __n) ? __offset : __n - 1; - __v = __gen_input(__local_id, __rngs...); + __v = __gen_input(__in_rng, __local_id); __sub_group_scan_partial<__sub_group_size, __is_inclusive, /*__init_present=*/true>( __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry, __n - (__subgroup_start_id + (__iters - 1) * __sub_group_size)); if constexpr (__capture_output) { if (__offset < __n) - __write_op(__offset, __v, __rngs...); + __write_op(__in_rng, __out_rng, __offset, __v); } } } @@ -286,12 +286,12 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... 
__rngs) const + const std::size_t __block_num) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -332,8 +332,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu /*__init_present=*/false, /*__capture_output=*/false, __max_inputs_per_item>( __sub_group, __gen_reduce_input, oneapi::dpl::__internal::__no_op{}, __reduce_op, nullptr, - __sub_group_carry, __start_id, __n, __inputs_per_item, - __subgroup_start_id, __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, /*unused*/ __in_rng, __start_id, __n, __inputs_per_item, + __subgroup_start_id, __sub_group_id, __active_subgroups); if (__sub_group_local_id == 0) __sub_group_partials[__sub_group_id] = __sub_group_carry.__v; __sub_group_carry.__destroy(); @@ -437,12 +437,12 @@ struct __parallel_reduce_then_scan_scan_submitter< __tmp_ptr[__num_sub_groups_global + 1 - (__block_num % 2)] = __block_carry_out; } - template + template sycl::event - operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, + operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, _Rngs&&... __rngs) const + const std::size_t __block_num) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -626,7 +626,7 @@ struct __parallel_reduce_then_scan_scan_submitter< if (__sub_group_local_id == 0) { // For unique patterns, always copy the 0th element to the output - //__write_op.__assign(__in_rng[0], __out_rng[0]); + __write_op.__assign(__in_rng[0], __out_rng[0]); } } @@ -672,8 +672,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/true, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } else // first group first block, no subgroup carry { @@ -681,8 +681,8 @@ struct __parallel_reduce_then_scan_scan_submitter< /*__init_present=*/false, /*__capture_output=*/true, __max_inputs_per_item>( __sub_group, __gen_scan_input, __scan_input_transform, __reduce_op, __write_op, - __sub_group_carry, __start_id, __n, __inputs_per_item, __subgroup_start_id, - __sub_group_id, __active_subgroups, __rngs...); + __sub_group_carry, __in_rng, __out_rng, __start_id, __n, __inputs_per_item, __subgroup_start_id, + __sub_group_id, __active_subgroups); } // If within the last active group and sub-group of the block, use the 0th work-item of the sub-group // to write out the last carry out for either the return value or the next block @@ -746,16 +746,15 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template + typename _Inclusive, typename 
_IsUniquePattern> auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _GenReduceInput __gen_reduce_input, + _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, - _Inclusive, _IsUniquePattern, - std::size_t __n, _Rngs&&... __rngs) + _Inclusive, _IsUniquePattern) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< @@ -781,7 +780,7 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ const std::uint32_t __num_work_items = __num_work_groups * __work_group_size; const std::uint32_t __num_sub_groups_local = __work_group_size / __sub_group_size; const std::uint32_t __num_sub_groups_global = __num_sub_groups_local * __num_work_groups; - //const std::size_t __n = __in_rng.size(); + const std::size_t __n = __in_rng.size(); const std::uint32_t __max_inputs_per_block = __work_group_size * __max_inputs_per_item * __num_work_groups; std::size_t __inputs_remaining = __n; if constexpr (__is_unique_pattern_v) @@ -849,11 +848,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __local_range = sycl::range<1>(__work_group_size); auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. - __event = __reduce_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. - __event = __scan_submitter(__exec, __kernel_nd_range, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __rngs...); + __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, + __inputs_per_sub_group, __inputs_per_item, __b); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. 
if (__b + 2 == __num_blocks) From ce1d495935685fbe625dc355a8d6bb9b5545f55f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 06:42:29 -0700 Subject: [PATCH 03/34] Implement correct return for reduce_by_segment Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/algorithm_ranges_impl_hetero.h | 13 ++++++++----- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 10 ++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index f845c209cfe..7558e29b4d5 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,11 +913,14 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op) - .wait(); - return 1; + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + // TODO: this needs to be enabled if reduce then scan cannot be satisfied. #if 0 // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. 
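The "+ 1" above works because the first component of the scanned tuple is a running segment index: __gen_reduce_input emits a 1 exactly when an element starts a new segment, and the inclusive scan accumulates those flags, so the component seen at the last element is the output position of the final segment. A serial sketch of that bookkeeping (my reading of the lambdas, not the device code itself):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<int> keys = {1, 1, 2, 2, 2, 3};
        std::equal_to<int> pred;

        // Serial stand-in for the inclusive scan of the "new segment"
        // flags: seg_idx mirrors the first tuple component at position i.
        std::size_t seg_idx = 0;
        for (std::size_t i = 1; i < keys.size(); ++i)
            if (!pred(keys[i - 1], keys[i]))
                ++seg_idx;

        // Segments are {1,1}, {2,2,2}, {3}: seg_idx ends at 2, so the
        // algorithm reports seg_idx + 1 == 3 key/value pairs written.
        std::cout << "segments written: " << seg_idx + 1 << '\n';
    }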
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 57d42747135..fccbf606ec3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1199,11 +1199,9 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0) + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - if (!__binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) @@ -1233,8 +1231,8 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{} - ); + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From f36b0a03f23b836dd16647dc381454f882790c10 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 07:29:49 -0700 Subject: [PATCH 04/34] Add support for flag predicates Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index fccbf606ec3..d78df4159e6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1187,6 +1187,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } + template auto @@ -1198,19 +1199,20 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, 
_ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1221,17 +1223,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; + using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From c8d667cb0c5b0a5983a494c6531f16775d4243f5 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:16:44 -0700 Subject: [PATCH 05/34] Revert "Add support for flag predicates" This reverts commit 0e0d50e8eea85685d46ccbb32b74f1211bfbab20. 
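The change being reverted (patch 04 above) widened the scanned state from a (flag, value) pair to a (flag, value, key) triple so the write-out could emit the carried key instead of re-reading a neighboring key, keeping flag-style predicates from being evaluated against the wrong element; the cost is a larger carry moved through every sub-group scan. A condensed view of the two reduction operators (paraphrasing the diffs with int values and plus in place of __binary_op, not quoting them):

    #include <cstddef>
    #include <tuple>

    // Two-component state: the output key is re-read from the input at
    // write time.
    std::tuple<std::size_t, int>
    reduce2(const std::tuple<std::size_t, int>& lhs, const std::tuple<std::size_t, int>& rhs)
    {
        if (std::get<0>(rhs) == 0) // rhs continues lhs's segment
            return {std::get<0>(lhs), std::get<1>(lhs) + std::get<1>(rhs)};
        return {std::get<0>(lhs) + std::get<0>(rhs), std::get<1>(rhs)};
    }

    // Three-component state from the reverted patch: the segment's key
    // rides along through the scan, so no key comparison is repeated at
    // write time.
    std::tuple<std::size_t, int, int>
    reduce3(const std::tuple<std::size_t, int, int>& lhs, const std::tuple<std::size_t, int, int>& rhs)
    {
        if (std::get<0>(rhs) == 0)
            return {std::get<0>(lhs), std::get<1>(lhs) + std::get<1>(rhs), std::get<2>(lhs)};
        return {std::get<0>(lhs) + std::get<0>(rhs), std::get<1>(rhs), std::get<2>(rhs)};
    }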
--- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d78df4159e6..fccbf606ec3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1187,7 +1187,6 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } - template auto @@ -1199,20 +1198,19 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}, _KeyType{__in_keys[__idx]}); + if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); + return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { if (std::get<0>(__rhs_tup) == 0) { return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)), std::get<2>(__lhs_tup)); + __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); } return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup), std::get<2>(__rhs_tup)); + std::get<1>(__rhs_tup)); }; auto __gen_scan_input = __gen_reduce_input; auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; @@ -1223,18 +1221,17 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac // Assuming this will be present in L1 cache if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = std::get<2>(__tup); + __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; - using _KeyType = oneapi::dpl::__internal::__value_t<_Range1>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, + __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From 0078cbd543e005de6aab33d465f7138be66ad1c4 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 08:37:53 -0700 Subject: [PATCH 06/34] Re-implement support for flag predicates in a more performant manner Signed-off-by: Matthew Michel --- 
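The re-implementation below keeps the two-component (flag, value) state and shifts the key write instead of carrying keys through the scan: element 0 seeds out_keys[0], and every interior segment end writes its reduced value at the segment's index and the first key of the next segment at index + 1, so the predicate is only ever evaluated with the earlier element passed first. A serial model of that write pattern (an assumed host-side equivalent with simplified types, not the kernel itself):

    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<int> keys = {1, 1, 2, 3, 3};
        std::vector<int> vals = {1, 2, 3, 4, 5};
        std::vector<int> out_keys(keys.size()), out_vals(vals.size());
        std::equal_to<int> pred;
        const std::size_t n = keys.size();

        std::size_t seg = 0; // scanned segment index (first tuple component)
        int sum = 0;         // running segment value (second tuple component)
        for (std::size_t i = 0; i < n; ++i)
        {
            // Continue the running sum within a segment, reset it otherwise.
            sum = (i == 0 || pred(keys[i - 1], keys[i])) ? sum + vals[i] : vals[i];
            if (i == 0)
                out_keys[0] = keys[0]; // seed the first segment's key
            if (i == n - 1)
                out_vals[seg] = sum;   // last segment has no successor key
            else if (!pred(keys[i], keys[i + 1]))
            {
                out_keys[seg + 1] = keys[i + 1]; // next segment's first key
                out_vals[seg] = sum;
                ++seg;
            }
        }
        // out_keys = {1, 2, 3}, out_vals = {3, 3, 9}, seg + 1 == 3 segments
        std::cout << seg + 1 << " segments\n";
    }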
.../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index fccbf606ec3..caaa1a74695 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1199,7 +1199,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx], __in_keys[__idx - 1])) + if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); }; @@ -1218,10 +1218,16 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // Assuming this will be present in L1 cache - if (__idx == __n - 1 || !__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) + // TODO: substantial improvement expected with special handling in kernel + // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. + if (__idx == 0) + __out_keys[0] = __in_keys[0]; + if (__idx == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) { - __out_keys[std::get<0>(__tup)] = __in_keys[__idx]; + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; __out_values[std::get<0>(__tup)] = std::get<1>(__tup); } }; From 5c2bb40ffce0b3a983a83780a8b514afc6b72681 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 09:51:35 -0700 Subject: [PATCH 07/34] Add fallback and remove old SYCL implementation Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 388 +----------------- .../hetero/algorithm_ranges_impl_hetero.h | 26 +- 2 files changed, 23 insertions(+), 391 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index a22f2970b49..23259e5e550 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -191,391 +191,17 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace template + typename _Range4, typename _BinaryPredicate, typename _BinaryOperator> oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - _KnownIdentity) +__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { return oneapi::dpl::experimental::ranges::reduce_by_segment( ::std::forward<_ExecutionPolicy>(__exec), 
::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); } -#if 0 -template -oneapi::dpl::__internal::__difference_t<_Range3> -__sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, - ::std::true_type /* has_known_identity */) -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - - using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReduceWgPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, - _BinaryOperator>; - - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - - const ::std::size_t __n = __keys.size(); - - constexpr ::std::uint16_t __vals_per_item = - 16; // Each work item serially processes 16 items. Best observed performance on gpu - - // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. - // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. - std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); - - // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use - // of SLM. 
- __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( - __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); - -#if _ONEDPL_COMPILE_KERNEL - auto __seg_reduce_count_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); - auto __seg_reduce_offset_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); - auto __seg_reduce_wg_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); - auto __seg_reduce_prefix_kernel = - __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); - __wgroup_size = - ::std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); -#endif - - ::std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); - - // intermediate reductions within a workgroup - auto __partials = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); - - auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); - - // the number of segment ends found in each work group - auto __seg_ends = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // buffer that stores an exclusive scan of the results - auto __seg_ends_scanned = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); - - // 1. Count the segment ends in each workgroup - auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceCountKernel>( - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_count_kernel, -#endif - sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - // 1a. Work item scan to identify segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - // 1b. Work group reduction - ::std::size_t __num_segs = __dpl_sycl::__reduce_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 1c. 
First work item writes segment count to global memory - if (__local_id == 0) - __seg_ends_acc[__group_id] = __num_segs; - }); - }); - - // 1.5 Small single-group kernel - auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__seg_end_identification); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceOffsetKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_offset_kernel, -#endif - sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); - auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); - __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, - __diff_type(0), sycl::plus<__diff_type>()); - }); - }); - - // 2. Work group reduction - auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { - __cgh.depends_on(__single_group_scan); - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReduceWgKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_wg_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - ::std::array<__val_type, __vals_per_item> __loc_partials; - - auto __group = __item.get_group(); - ::std::size_t __group_id = __item.get_group(0); - ::std::size_t __local_id = __item.get_local_id(0); - ::std::size_t __global_id = __item.get_global_id(0); - - // 2a. Lookup the number of prior segs - auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 2b. Perform a serial scan within the work item over assigned elements. Store partial - // reductions in work group local memory. - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - - ::std::size_t __max_end = 0; - ::std::size_t __item_segments = 0; - auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - __val_type __accumulator = __identity; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - __accumulator = __binary_op(__accumulator, __values[__i]); - if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - __loc_partials[__i - __start] = __accumulator; - ++__item_segments; - __max_end = __local_id; - __accumulator = __identity; - } - } - - // 2c. Count the number of prior work segments cooperatively over group - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - ::std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; - - // 2d. 
Find the greatest segment end less than the current index (inclusive) - ::std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); - - // __wg_segmented_scan is a derivative work and responsible for the third header copyright - __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( - __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, - __binary_op, __wgroup_size); - - // 2e. Update local partial reductions in first segment and write to global memory. - bool __apply_aggs = true; - ::std::size_t __item_offset = 0; - - // first item in group does not have any work-group aggregates to apply - if (__local_id == 0) - { - __apply_aggs = false; - if (__global_id == 0 && __n > 0) - { - // first segment identifier is always the first key - __out_keys[0] = __keys[0]; - } - } - - // apply the aggregates and copy the locally stored values to destination buffer - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __start_idx + __item_offset; - if (__apply_aggs) - { - __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); - __apply_aggs = false; - } - else - { - __out_values[__idx] = __loc_partials[__i - __start]; - } - if (__i != __n - 1) - { - __out_keys[__idx + 1] = __keys[__i + 1]; - } - ++__item_offset; - } - } - - // 2f. Output the work group aggregate and total number of segments for use in phase 3. - if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out - { - // If no segment ends in the item, the aggregates from previous work groups must be applied. - if (__max_end == 0) - { - // needs to be inclusive with last element - __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); - } - else - { - __partials_acc[__group_id] = __accumulator; - } - } - }); - }); - - // 3. 
Apply inter work-group aggregates - __exec.queue() - .submit([&](sycl::handler& __cgh) { - oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); - - auto __partials_acc = __partials.template get_access(__cgh); - auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); - auto __seg_ends_acc = __seg_ends.template get_access(__cgh); - auto __end_idx_acc = __end_idx.template get_access(__cgh); - - __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); - __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); - - __cgh.depends_on(__wg_reduce); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT - __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); -#endif - __cgh.parallel_for<_SegReducePrefixKernel>( -#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT - __seg_reduce_prefix_kernel, -#endif - sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { - auto __group = __item.get_group(); - ::std::int64_t __group_id = __item.get_group(0); - ::std::size_t __global_id = __item.get_global_id(0); - ::std::size_t __local_id = __item.get_local_id(0); - - ::std::size_t __start = __global_id * __vals_per_item; - ::std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); - ::std::size_t __item_segments = 0; - - ::std::int64_t __wg_agg_idx = __group_id - 1; - __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - - bool __ag_exists = false; - // 3a. Check to see if an aggregate exists and compute that value in the first - // work item. - if (__group_id != 0) - { - __ag_exists = __start < __n; - // local reductions followed by a sweep - constexpr ::std::int32_t __vals_to_explore = 16; - bool __last_it = false; - __loc_seg_ends_acc[__local_id] = false; - __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - for (::std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; - __i -= __wgroup_size * __vals_to_explore) - { - __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; - // exploration phase - for (::std::int32_t __j = __i; - __j > __dpl_sycl::__maximum<::std::int32_t>{}(-1L, __i - __vals_to_explore); --__j) - { - __local_collector = __binary_op(__partials_acc[__j], __local_collector); - if (__seg_ends_acc[__j] || __j == 0) - { - __loc_seg_ends_acc[__local_id] = true; - break; - } - } - __loc_partials_acc[__local_id] = __local_collector; - __dpl_sycl::__group_barrier(__item); - // serial aggregate collection and synchronization - if (__local_id == 0) - { - for (::std::size_t __j = 0; __j < __wgroup_size; ++__j) - { - __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); - if (__loc_seg_ends_acc[__j]) - { - __last_it = true; - break; - } - } - } - __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); - __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); - } - - // Check to see if aggregates exist. - // The last group must always stay to write the final index - __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); - if (!__ag_exists && __group_id != __n_groups - 1) - return; - } - // 3b. 
count the segment ends - for (::std::size_t __i = __start; __i < __end; ++__i) - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - ++__item_segments; - - ::std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); - - // 3c. Determine prior index - ::std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; - - // 3d. Second pass over the keys, reidentifying end segments and applying work group - // aggregates if appropriate. Both the key and reduction value are written to the final output at the - // computed index - ::std::size_t __item_offset = 0; - for (::std::size_t __i = __start; __i < __end; ++__i) - { - if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) - { - ::std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; - - // apply the aggregate if it is the first segment end in the workgroup only - if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) - __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); - - ++__item_offset; - // the last item must write the last index's position to return - if (__i == __n - 1) - __end_idx_acc[0] = __idx; - } - } - }); - }) - .wait(); - - return __end_idx.get_host_access()[0] + 1; -} -#endif - template ::std::pair @@ -615,9 +241,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol typename ::std::iterator_traits::value_type>::type; // number of unique keys - _CountType __n = __sycl_reduce_by_segment( - __tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op, has_known_identity{}); + _CountType __n = + __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), + key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); return ::std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7558e29b4d5..72782105d93 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,15 +913,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - // TODO: this needs to be enabled if reduce then scan cannot be satisfied. - #if 0 +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
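
The gate on the next line relies on a device query. A minimal sketch of the kind of check __is_gpu_with_sg_32 is assumed to perform (illustrative, not the library's exact helper):

    #include <algorithm>
    #include <sycl/sycl.hpp>

    // True when the device is a GPU that supports a sub-group size of 32.
    bool is_gpu_with_sg_32(const sycl::queue& q)
    {
        const sycl::device d = q.get_device();
        const auto sizes = d.get_info<sycl::info::device::sub_group_sizes>();
        return d.is_gpu() && std::find(sizes.begin(), sizes.end(), 32) != sizes.end();
    }
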
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    {
+        auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
+            _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys),
+            std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
+            __binary_pred, __binary_op);
+        __res.wait();
+        // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
+        // past-the-end iterator pair of segmented reduction.
+        return std::get<0>(__res.get()) + 1;
+    }
+#endif
     // The algorithm reduces values in __values where the
     // associated keys for the values are equal to the adjacent key.
     //
@@ -1052,7 +1059,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
         .__deferrable_wait();

     return __result_end;
-    #endif
 }

 } // namespace __ranges

From f5f861e0e0a6b49c247c0a14770b7f1a4c543e23 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 15 Oct 2024 11:42:03 -0700
Subject: [PATCH 08/34] Switch from using lambdas to functors

Signed-off-by: Matthew Michel

---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 131 +++++++++++-------
 1 file changed, 82 insertions(+), 49 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index caaa1a74695..60356967249 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -788,6 +788,68 @@ struct __gen_transform_input
     _UnaryOp __unary_op;
 };

+template <typename _BinaryPred>
+struct __gen_red_by_seg_input
+{
+    template <typename _InRng>
+    auto
+    operator()(const _InRng& __in_rng, std::size_t __id) const
+    {
+        auto&& __in_keys = std::get<0>(__in_rng.tuple());
+        auto&& __in_vals = std::get<1>(__in_rng.tuple());
+        using _ValueType = oneapi::dpl::__internal::__value_t<decltype(__in_vals)>;
+        if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id]))
+            return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]});
+        return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]});
+    }
+    _BinaryPred __binary_pred;
+};
+
+template <typename _BinaryOp>
+struct __red_by_seg_op
+{
+    template <typename _Tup1, typename _Tup2>
+    auto
+    operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const
+    {
+        if (std::get<0>(__rhs_tup) == 0)
+        {
+            return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup),
+                                                       __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup)));
+        }
+        return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup),
+                                                   std::get<1>(__rhs_tup));
+    }
+    _BinaryOp __binary_op;
+};
+
+template <typename _BinaryPred>
+struct __write_red_by_seg
+{
+    template <typename _InRng, typename _OutRng, typename _Tup>
+    void
+    operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const
+    {
+        auto&& __in_keys = std::get<0>(__in_rng.tuple());
+        auto&& __out_keys = std::get<0>(__out_rng.tuple());
+        auto&& __out_values = std::get<1>(__out_rng.tuple());
+        // TODO: substantial improvement expected with special handling in kernel
+        // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we
+        // must output the current segment's value and the next segment's key.
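
For intuition, the combine defined in __red_by_seg_op above is associative but not commutative. Taking std::plus<int> as __binary_op, it acts on (segment-count, value) pairs as follows (a worked example, with "." standing for the combine):

    (0, 3) . (0, 4) == (0, 7)   // same segment: values reduce
    (0, 3) . (1, 4) == (1, 4)   // rhs opens a new segment: the value restarts
    (1, 3) . (1, 4) == (2, 4)   // segment counts accumulate into output indices
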
+ if (__id == 0) + __out_keys[0] = __in_keys[0]; + if (__id == __n - 1) + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + { + __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; + __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + } + } + _BinaryPred __binary_pred; + std::size_t __n; +}; + struct __simple_write_to_id { template @@ -1187,58 +1249,29 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } } -template + +template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - auto __n = __keys.size(); - auto __gen_reduce_input = [=](const auto& __in_rng, std::size_t __idx) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); - using _ValueType = oneapi::dpl::__internal::__value_t; - if (__idx == 0 || __binary_pred(__in_keys[__idx - 1], __in_keys[__idx])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__idx]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__idx]}); - }; - auto __reduce_op = [=](const auto& __lhs_tup, const auto& __rhs_tup) { - if (std::get<0>(__rhs_tup) == 0) - { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); - } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); - }; - auto __gen_scan_input = __gen_reduce_input; - auto __scan_input_transform = oneapi::dpl::__internal::__no_op{}; - auto __write_out = [=](auto& __in_rng, auto& __out_rng, std::size_t __idx, const auto& __tup) { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_vals = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__idx] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. 
- if (__idx == 0) - __out_keys[0] = __in_keys[0]; - if (__idx == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__idx], __in_keys[__idx + 1])) - { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__idx + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - } - }; +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + using _GenScanInput = _GenReduceInput; + using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + std::size_t __n = __keys.size(); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - __gen_reduce_input, __reduce_op, __gen_scan_input, __scan_input_transform, - __write_out, oneapi::dpl::unseq_backend::__no_init_value>{}, - /*Inclusive*/std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _WriteOp{__binary_pred, __n}, + oneapi::dpl::unseq_backend::__no_init_value>{}, + /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } template From 364a8894c0fc97cc5ebc81c52b3460e4de987562 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:04:35 -0700 Subject: [PATCH 09/34] Add device copyable specializations for red-by-seg functors and update testing Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 29 ++++++++++++++++++ .../device_copyable.pass.cpp | 30 +++++++++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 60356967249..e72bfacff70 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -870,7 +870,7 @@ struct __gen_mask { template bool - operator()(const _InRng& __in_rng, std::size_t __id) const + operator()(_InRng&& __in_rng, std::size_t __id) const { return __pred((__rng_transform(std::forward<_InRng>(__in_rng)))[__id]); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 2d0e88fd34b..40713cf8621 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -236,6 +236,9 @@ namespace oneapi::dpl::__par_backend_hetero template struct __gen_transform_input; +template +struct __gen_red_by_seg_input; + template struct __gen_mask; 
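
These forward declarations allow the named functor types to receive sycl::is_device_copyable specializations below, which is one motivation for the previous patch's move away from lambdas: a lambda's type cannot be named outside its own expression. The opt-in follows the usual SYCL 2020 pattern; a user-side analogue with hypothetical names:

    // Hypothetical functor; it is device copyable exactly when its member is.
    template <typename Pred>
    struct my_gen_input
    {
        Pred pred;
    };

    template <typename Pred>
    struct sycl::is_device_copyable<my_gen_input<Pred>> : sycl::is_device_copyable<Pred>
    {
    };
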
@@ -254,12 +257,18 @@ struct __write_to_id_if; template struct __write_to_id_if_else; +template +struct __write_red_by_seg; + template struct __early_exit_find_or; template struct __leaf_sorter; +template +struct __red_by_seg_op; + } // namespace oneapi::dpl::__par_backend_hetero template @@ -276,6 +285,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_unique_mask, _BinaryPredicate)> @@ -309,6 +325,13 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__write_red_by_seg, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + template struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__early_exit_find_or, _ExecutionPolicy, _Pred)> @@ -323,6 +346,12 @@ struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen { }; +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__red_by_seg_op, _BinaryOp)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryOp> +{ +}; + namespace oneapi::dpl::unseq_backend { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 0922c66c84a..ce57bb9d931 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -157,6 +157,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); + //__gen_red_by_seg_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is not device copyable with device copyable types"); + //__gen_mask static_assert(sycl::is_device_copyable_v>, "__gen_mask is not device copyable with device copyable types"); @@ -186,6 +191,11 @@ test_device_copyable() sycl::is_device_copyable_v>, "__write_to_id_if_else is not device copyable with device copyable types"); + //__write_red_by_seg + static_assert( + sycl::is_device_copyable_v>, + "__write_red_by_seg is not device copyable with device copyable types"); + // __early_exit_find_or static_assert( sycl::is_device_copyable_v< @@ -201,6 +211,11 @@ test_device_copyable() noop_device_copyable>>, "__leaf_sorter is not device copyable with device copyable types"); + //__red_by_seg_op + static_assert( + sycl::is_device_copyable_v>, + "__red_by_seg_op is not device copyable with device copyable types"); + //__not_pred static_assert(sycl::is_device_copyable_v>, "__not_pred is not device copyable with device copyable types"); @@ -400,6 +415,11 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); + //__gen_red_by_seg_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_input is device copyable with device copyable types"); + //__gen_mask static_assert(!sycl::is_device_copyable_v>, "__gen_mask is device copyable with non device copyable types"); @@ -429,6 +449,11 @@ test_non_device_copyable() 
oneapi::dpl::__par_backend_hetero::__write_to_id_if_else>, "__write_to_id_if_else is device copyable with non device copyable types"); + //__write_red_by_seg + static_assert( + !sycl::is_device_copyable_v>, + "__write_red_by_seg is device copyable with device copyable types"); + // __early_exit_find_or static_assert( !sycl::is_device_copyable_v>, "__leaf_sorter is device copyable with non device copyable types"); + //__red_by_seg_op + static_assert( + !sycl::is_device_copyable_v>, + "__red_by_seg_op is device copyable with device copyable types"); + //__not_pred static_assert(!sycl::is_device_copyable_v>, "__not_pred is device copyable with non device copyable types"); From 86cf9ebd6e80f29a78c8e30725156519a404cb6f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 15 Oct 2024 12:08:05 -0700 Subject: [PATCH 10/34] Fix typo in error message in device_copyable.pass.cpp Signed-off-by: Matthew Michel --- .../general/implementation_details/device_copyable.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index ce57bb9d931..f982bc394ab 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -418,7 +418,7 @@ test_non_device_copyable() //__gen_red_by_seg_input static_assert( !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with device copyable types"); + "__gen_red_by_seg_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -452,7 +452,7 @@ test_non_device_copyable() //__write_red_by_seg static_assert( !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with device copyable types"); + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( @@ -468,7 +468,7 @@ test_non_device_copyable() //__red_by_seg_op static_assert( !sycl::is_device_copyable_v>, - "__red_by_seg_op is device copyable with device copyable types"); + "__red_by_seg_op is device copyable with non device copyable types"); //__not_pred static_assert(!sycl::is_device_copyable_v>, From 4f5ff6367882a026e70c0231a160faac6086d052 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 07:20:31 -0700 Subject: [PATCH 11/34] Introduce separate input generation for scan phase and update tests Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 2 +- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 66 +++++++++++++------ .../dpl/pstl/hetero/dpcpp/sycl_traits.h | 14 +++- .../device_copyable.pass.cpp | 22 +++++-- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 72782105d93..2f42f5e4b60 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -919,7 +919,7 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& // whether or not to use reduce-then-scan here. 
if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, __binary_op); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index e72bfacff70..81e4a14dcf2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -789,7 +789,7 @@ struct __gen_transform_input }; template -struct __gen_red_by_seg_input +struct __gen_red_by_seg_reduce_input { template auto @@ -798,11 +798,33 @@ struct __gen_red_by_seg_input auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - if (__id == 0 || __binary_pred(__in_keys[__id - 1], __in_keys[__id])) - return oneapi::dpl::__internal::make_tuple(size_t{0}, _ValueType{__in_vals[__id]}); - return oneapi::dpl::__internal::make_tuple(size_t{1}, _ValueType{__in_vals[__id]}); + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; + std::size_t __n; +}; + +template +struct __gen_red_by_seg_scan_input +{ + template + auto + operator()(const _InRng& __in_rng, std::size_t __id) const + { + auto&& __in_keys = std::get<0>(__in_rng.tuple()); + auto&& __in_vals = std::get<1>(__in_rng.tuple()); + using _ValueType = oneapi::dpl::__internal::__value_t; + // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs + std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + // Each last element in a segment is marked with an output flag to store its reduction in the write phase + bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); + const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; + return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + __output_mask, __candidate_key); + } + _BinaryPred __binary_pred; + std::size_t __n; }; template @@ -828,22 +850,26 @@ struct __write_red_by_seg { template void - operator()(_InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { + using std::get; auto&& __in_keys = std::get<0>(__in_rng.tuple()); auto&& __out_keys = std::get<0>(__out_rng.tuple()); auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel - // The first key must be output to __out_keys[__id] for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. + // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. 
+ // The first key must be output to __out_keys for a segment, so when we encounter a segment end we + // must output the current segment's value and the next segment's key. For index zero we must special handle + // and write the first key from the current index. if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = __in_keys[0]; + // We are at the end of the input so there is no key to output for the next segment if (__id == __n - 1) - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); - else if (!__binary_pred(__in_keys[__id], __in_keys[__id + 1])) + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); + // Update the current segment's output value and the next segment's key value + else if (get<1>(__tup)) { - __out_keys[std::get<0>(__tup) + 1] = __in_keys[__id + 1]; - __out_values[std::get<0>(__tup)] = std::get<1>(__tup); + __out_keys[get<0>(get<0>(__tup)) + 1] = get<2>(__tup); + __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); } } _BinaryPred __binary_pred; @@ -1253,14 +1279,14 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { - using _GenReduceInput = __gen_red_by_seg_input<_BinaryPredicate>; + using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; using _ReduceOp = __red_by_seg_op<_BinaryOperator>; - using _GenScanInput = _GenReduceInput; - using _ScanInputTransform = oneapi::dpl::__internal::__no_op; + using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + using _ScanInputTransform = __get_zeroth_element; using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); @@ -1268,7 +1294,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag __bac __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h index 40713cf8621..7d3fd829cc5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_traits.h @@ -237,7 +237,10 @@ template struct __gen_transform_input; template -struct __gen_red_by_seg_input; +struct __gen_red_by_seg_reduce_input; + +template +struct __gen_red_by_seg_scan_input; template struct __gen_mask; @@ -286,7 +289,14 @@ struct 
sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backen }; template -struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_input, +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input, + _BinaryPred)> + : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> +{ +}; + +template +struct sycl::is_device_copyable<_ONEDPL_SPECIALIZE_FOR(oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input, _BinaryPred)> : oneapi::dpl::__internal::__are_all_device_copyable<_BinaryPred> { diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index f982bc394ab..97707a50119 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -157,10 +157,15 @@ test_device_copyable() sycl::is_device_copyable_v>, "__gen_transform_input is not device copyable with device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is not device copyable with device copyable types"); + sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + + //__gen_red_by_seg_scan_input + static_assert( + sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -415,10 +420,15 @@ test_non_device_copyable() !sycl::is_device_copyable_v>, "__gen_transform_input is device copyable with non device copyable types"); - //__gen_red_by_seg_input + //__gen_red_by_seg_reduce_input + static_assert( + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + + //__gen_red_by_seg_reduce_input static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_input is device copyable with non device copyable types"); + !sycl::is_device_copyable_v>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, From 6b2b3289ae33ce65c55ba830ac00b1f005932f82 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:09:56 -0700 Subject: [PATCH 12/34] Improve code readability Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 65 +++++++++++-------- 1 file changed, 38 insertions(+), 27 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 81e4a14dcf2..6a27f1e4539 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -808,20 +808,28 @@ struct __gen_red_by_seg_reduce_input template struct __gen_red_by_seg_scan_input { + // Returns the following tuple: + // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) + // new_seg_mask : 1 for a start of a new segment, 0 otherwise + // value : Current element's value for reduction + // output_value : Whether this work-item should write an output + // candidate_key: The key of the next segment to write if output_value is true. 
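
A short trace may help. Assuming keys {a, a, b}, values {1, 2, 3}, n = 3, and plus-reduction, the generator below produces the left column, and after the inclusive scan the write phase performs the actions on the right:

    // id 0 -> ((0, 1), false, a)    out_keys[0] = a            (first-element special case)
    // id 1 -> ((0, 2), true,  b)    out_values[0] = 1 + 2 = 3, out_keys[1] = b
    // id 2 -> ((1, 3), true,  b)    out_values[1] = 3          (last element; its key is a placeholder)
    // result: out_keys = {a, b}, out_values = {3, 3}
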
template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); + auto __in_keys = std::get<0>(__in_rng.tuple()); + auto __in_vals = std::get<1>(__in_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; - // Each beginning segment is marked with a flag to know when to stop reduce lower indexed inputs std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - // Each last element in a segment is marked with an output flag to store its reduction in the write phase - bool __output_mask = __id == __n - 1 || !__binary_pred(__in_keys[__id], __in_keys[__id + 1]); - const auto __candidate_key = __id < __n - 1 ? __in_keys[__id + 1] : __in_keys[__id]; - return oneapi::dpl::__internal::make_tuple(oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - __output_mask, __candidate_key); + if (__id == __n - 1) + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, + _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}); } _BinaryPred __binary_pred; std::size_t __n; @@ -834,13 +842,15 @@ struct __red_by_seg_op auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const { + using std::get; + // The left-hand side has processed elements from the same segment, so update the reduction value. if (std::get<0>(__rhs_tup) == 0) { - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup), - __binary_op(std::get<1>(__lhs_tup), std::get<1>(__rhs_tup))); + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup), + __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup))); } - return oneapi::dpl::__internal::make_tuple(std::get<0>(__lhs_tup) + std::get<0>(__rhs_tup), - std::get<1>(__rhs_tup)); + // We are looking at elements from a previous segment so just update the output index. + return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup) + get<0>(__rhs_tup), get<1>(__rhs_tup)); } _BinaryOp __binary_op; }; @@ -853,23 +863,24 @@ struct __write_red_by_seg operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __out_keys = std::get<0>(__out_rng.tuple()); - auto&& __out_values = std::get<1>(__out_rng.tuple()); - // TODO: substantial improvement expected with special handling in kernel of first and last sub-groups. - // The first key must be output to __out_keys for a segment, so when we encounter a segment end we - // must output the current segment's value and the next segment's key. For index zero we must special handle - // and write the first key from the current index. 
+ auto __in_keys = get<0>(__in_rng.tuple()); + auto __out_keys = get<0>(__out_rng.tuple()); + auto __out_values = get<1>(__out_rng.tuple()); + using _KeyType = oneapi::dpl::__internal::__value_t; + using _ValType = oneapi::dpl::__internal::__value_t; + + const _KeyType& __next_segment_key = get<2>(__tup); + const _ValType& __cur_segment_value = get<1>(get<0>(__tup)); + const bool __is_seg_end = get<1>(__tup); + const std::size_t __out_idx = get<0>(get<0>(__tup)); + if (__id == 0) - __out_keys[0] = __in_keys[0]; - // We are at the end of the input so there is no key to output for the next segment - if (__id == __n - 1) - __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); - // Update the current segment's output value and the next segment's key value - else if (get<1>(__tup)) + __out_keys[0] = __in_keys[0]; + if (__is_seg_end) { - __out_keys[get<0>(get<0>(__tup)) + 1] = get<2>(__tup); - __out_values[get<0>(get<0>(__tup))] = get<1>(get<0>(__tup)); + __out_values[__out_idx] = __cur_segment_value; + if (__id != __n - 1) + __out_keys[__out_idx + 1] = __next_segment_key; } } _BinaryPred __binary_pred; From b0d04e52514495e7072c9ed6711893871e03ab2c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:46:54 -0700 Subject: [PATCH 13/34] Add optional first key field to scan input and remove input range in write operations Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6a27f1e4539..4c2ad58244d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -795,8 +796,8 @@ struct __gen_red_by_seg_reduce_input auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto&& __in_keys = std::get<0>(__in_rng.tuple()); - auto&& __in_vals = std::get<1>(__in_rng.tuple()); + auto __in_keys = std::get<0>(__in_rng.tuple()); + auto __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}); @@ -821,15 +822,19 @@ struct __gen_red_by_seg_scan_input auto __in_keys = std::get<0>(__in_rng.tuple()); auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; + using _OptKeyType = std::optional<_KeyType>; using _ValueType = oneapi::dpl::__internal::__value_t; + _OptKeyType __first_key; + if (__id == 0) + __first_key = _OptKeyType{__in_keys[0]}; std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); if (__id == __n - 1) return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - _KeyType{__in_keys[__id]}); // __in_keys[__id] is an unused placeholder + _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}); + 
!__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); } _BinaryPred __binary_pred; std::size_t __n; @@ -858,12 +863,11 @@ struct __red_by_seg_op template struct __write_red_by_seg { - template + template void - operator()(const _InRng& __in_rng, _OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const + operator()(_OutRng& __out_rng, std::size_t __id, const _Tup& __tup) const { using std::get; - auto __in_keys = get<0>(__in_rng.tuple()); auto __out_keys = get<0>(__out_rng.tuple()); auto __out_values = get<1>(__out_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; @@ -875,7 +879,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = __in_keys[0]; + __out_keys[0] = *get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -889,9 +893,9 @@ struct __write_red_by_seg struct __simple_write_to_id { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, std::size_t __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. @@ -1038,9 +1042,9 @@ struct __get_zeroth_element template struct __write_to_id_if { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { // Use of an explicit cast to our internal tuple type is required to resolve conversion issues between our // internal tuple and std::tuple. If the underlying type is not a tuple, then the type will just be passed through. 
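
Carrying the first key through the scan item as a std::optional is what lets the write operations above drop their input-range parameter entirely. The hand-off in isolation (a minimal sketch, not the library's types):

    #include <cstddef>
    #include <optional>

    // Only the scan item generated at index 0 carries an engaged key.
    std::optional<int> make_first_key(std::size_t id, const int* keys)
    {
        return id == 0 ? std::optional<int>{keys[0]} : std::nullopt;
    }
    // The write phase dereferences the optional only under the same id == 0
    // condition, so a disengaged value is never read.
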
@@ -1056,9 +1060,9 @@ struct __write_to_id_if template struct __write_to_id_if_else { - template + template void - operator()(const _InRng&, _OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const + operator()(_OutRng& __out_rng, _SizeType __id, const _ValueType& __v) const { using _ConvertedTupleType = typename oneapi::dpl::__internal::__get_tuple_type(__v))>, From af445321ce5ebb12154626e28d544762615304d2 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 09:49:30 -0700 Subject: [PATCH 14/34] Update __write_op in reduce-then-scan Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_then_scan.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 09c0d754bd7..8c0762f2a38 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -173,7 +173,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } if (__is_full_block) @@ -187,7 +187,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -202,7 +202,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id + __j * __sub_group_size, __v); + __write_op(__out_rng, __start_id + __j * __sub_group_size, __v); } } } @@ -225,7 +225,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__start_id < __n) - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } } else @@ -235,7 +235,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __start_id, __v); + __write_op(__out_rng, __start_id, __v); } for (std::uint32_t __j = 1; __j < __iters - 1; __j++) @@ -246,7 +246,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI __sub_group, __scan_input_transform(__v), __binary_op, __sub_group_carry); if constexpr (__capture_output) { - __write_op(__in_rng, __out_rng, __local_id, __v); + __write_op(__out_rng, __local_id, __v); } } @@ -259,7 +259,7 @@ __scan_through_elements_helper(const __dpl_sycl::__sub_group& __sub_group, _GenI if constexpr (__capture_output) { if (__offset < __n) - __write_op(__in_rng, __out_rng, __offset, __v); + __write_op(__out_rng, __offset, __v); } } } From 9066f80a378bddecb8f029b5653f98445473bd6c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 18 Oct 2024 13:17:58 -0700 Subject: [PATCH 15/34] Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro Signed-off-by: Matthew Michel --- CMakeLists.txt | 10 
-------- cmake/README.md | 1 - .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++----------- test/CMakeLists.txt | 5 ---- .../numeric.ops/reduce_by_segment.pass.cpp | 23 ++++--------------- 5 files changed, 8 insertions(+), 48 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe9a214a1f..d8686e3eb47 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,16 +293,6 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() - if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") - else() - message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") - target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") - endif() - endif() - # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 7335b7e2312..0683a377820 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,7 +18,6 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO | | ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON | -| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results, which may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | | Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version): diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 2caa6add318..9bf4325f221 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -34,21 +34,11 @@ namespace unseq_backend //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace. //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard. 
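
The trait below builds on the SYCL 2020 known-identity machinery. For example, on a conforming implementation:

    #include <sycl/sycl.hpp>

    // Known-identity queries for the uncontroversial plus case.
    static_assert(sycl::has_known_identity_v<sycl::plus<int>, int>);
    constexpr int zero = sycl::known_identity_v<sycl::plus<int>, int>; // 0
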
-template -inline constexpr bool __can_use_known_identity = -# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types - !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t)); -# else - true; -# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to //std::multiplies, std::bit_or, std::bit_and and std::bit_xor operations will be fixed. //std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = ::std::conditional_t< - __can_use_known_identity<_Tp>, +using __has_known_identity = # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -60,16 +50,15 @@ using __has_known_identity = ::std::conditional_t< ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 - ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_ONEDPL_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 90eb3d5c737..e85e8e9f5f8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,7 +195,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") - set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -209,10 +208,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") - elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) - onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") - string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) - 
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 4de95e26e9b..c75be5f2694 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -13,14 +13,6 @@ // //===----------------------------------------------------------------------===// -#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - -#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) -# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION -#endif - #include "support/test_config.h" #include "oneapi/dpl/execution" @@ -307,17 +299,12 @@ run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT // Skip 64-byte types testing when the algorithm is broken and there is no the workaround -#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION - if constexpr (sizeof(ValueType) != 8) -#endif + if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) - { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory - test4buffers>(); - } + // Run tests for USM shared memory + test4buffers>(); + // Run tests for USM device memory + test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT } From c5ff176c19bd4178d5a9a0095079efe80ce5c92e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:53:35 -0500 Subject: [PATCH 16/34] Alternate testing between usm shared and device to prevent excessive binary size Signed-off-by: Matthew Michel --- .../numeric.ops/reduce_by_segment.pass.cpp | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index c75be5f2694..2cee63239b6 100644 --- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp +++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp @@ -293,17 +293,14 @@ test_flag_pred() } #endif -template +template void run_test_on_device() { #if TEST_DPCPP_BACKEND_PRESENT - // Skip 64-byte types testing when the algorithm is broken and there is no the workaround if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device())) { - // Run tests for USM shared memory - test4buffers>(); - // Run tests for USM device memory + constexpr sycl::usm::alloc allocation_type = use_device_alloc ? 
sycl::usm::alloc::device : sycl::usm::alloc::shared; test4buffers>(); } #endif // TEST_DPCPP_BACKEND_PRESENT @@ -322,12 +319,12 @@ run_test_on_host() #endif // !_PSTL_ICC_TEST_SIMD_UDS_BROKEN && !_PSTL_ICPX_TEST_RED_BY_SEG_OPTIMIZER_CRASH } -template +template void run_test() { run_test_on_host(); - run_test_on_device(); + run_test_on_device(); } int @@ -337,7 +334,7 @@ main() // kernels. This is being filed to the compiler team. In the meantime, we can rearrange this test // to resolve the issue on our side. #if _PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif #if TEST_DPCPP_BACKEND_PRESENT @@ -347,17 +344,17 @@ main() #endif // TEST_DPCPP_BACKEND_PRESENT #if !_PSTL_RED_BY_SEG_WINDOWS_COMPILE_ORDER_BROKEN - run_test, UserBinaryPredicate>, MaxFunctor>>(); + run_test, UserBinaryPredicate>, MaxFunctor>>(); #endif - run_test, ::std::plus>(); - run_test, ::std::plus>(); - run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); + run_test, ::std::plus>(); // TODO investigate possible overflow: see issue #1416 - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); - run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); + run_test_on_device, ::std::multiplies>(); return TestUtils::done(); } From 57dc125b7d0f5ab80ec1b97b89554677b1d743e0 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:48:28 -0700 Subject: [PATCH 17/34] Performance tuning within scan input functor Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 52 +++++++++++++------ 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 4c2ad58244d..07eb9716c26 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -28,7 +28,6 @@ #include #include #include -#include #include "../../iterator_impl.h" #include "../../execution_impl.h" @@ -810,11 +809,12 @@ template struct __gen_red_by_seg_scan_input { // Returns the following tuple: - // ((new_seg_mask: size_t, value: ValueType), output_value: bool, candidate_key: KeyType) - // new_seg_mask : 1 for a start of a new segment, 0 otherwise - // value : Current element's value for reduction - // output_value : Whether this work-item should write an output - // candidate_key: The key of the next segment to write if output_value is true. + // ((new_seg_mask, value), output_value, next_key, current_key) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction + // bool output_value : Whether this work-item should write an output (end of segment) + // KeyType next_key : The key of the next segment to write if output_value is true + // KeyType current_key : The current element's key. 
This is only ever used by work-item 0 to write the first key template auto operator()(const _InRng& __in_rng, std::size_t __id) const @@ -822,21 +822,37 @@ struct __gen_red_by_seg_scan_input auto __in_keys = std::get<0>(__in_rng.tuple()); auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; - using _OptKeyType = std::optional<_KeyType>; using _ValueType = oneapi::dpl::__internal::__value_t; - _OptKeyType __first_key; - if (__id == 0) - __first_key = _OptKeyType{__in_keys[0]}; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); - if (__id == __n - 1) + const _KeyType& __current_key = __in_keys[__id]; + // Ordering the most common condition first has yielded the best results. + if (__id > 0 && __id < __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + const _KeyType& __next_key = __in_keys[__id + 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), + __next_key, __current_key); + } + else if (__id == __n - 1) + { + const _KeyType& __prev_key = __in_keys[__id - 1]; + std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - _KeyType{__in_keys[__id]}, __first_key); // __in_keys[__id] is an unused placeholder - return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__in_keys[__id], __in_keys[__id + 1]), _KeyType{__in_keys[__id + 1]}, __first_key); + __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + } + else + { + const _KeyType& __next_key = __in_keys[__id + 1]; + return oneapi::dpl::__internal::make_tuple( + oneapi::dpl::__internal::make_tuple(std::size_t{0}, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); + } } _BinaryPred __binary_pred; + // For correctness of the function call operator, __n must be greater than 1. 
std::size_t __n; }; @@ -879,7 +895,7 @@ struct __write_red_by_seg const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = *get<3>(__tup); + __out_keys[0] = get<3>(__tup); if (__is_seg_end) { __out_values[__out_idx] = __cur_segment_value; @@ -1305,6 +1321,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); + // __gen_red_by_seg_scan_input requires that __n > 1 + assert(__n > 1); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), From 3e2e7e733d02c0daa0f92c9d9cdc3eef1b094c31 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 21 Oct 2024 13:49:44 -0700 Subject: [PATCH 18/34] Handle n=0, n=1 first in reduce_by_segment Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 2f42f5e4b60..7179e1f864a 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -913,22 +913,6 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - } -#endif // The algorithm reduces values in __values where the // associated keys for the values are equal to the adjacent key. // @@ -960,6 +944,22 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } +#if _ONEDPL_BACKEND_SYCL + // We would normally dispatch to the parallel implementation which would make the decision to invoke + // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose + // whether or not to use reduce-then-scan here. 
+ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } +#endif using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; From c5efc99105aae9efe13b4c2d02ae54ca55d83c5c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 08:50:00 -0700 Subject: [PATCH 19/34] Code cleanup Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 56 +++---------------- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 20 +++---- 2 files changed, 19 insertions(+), 57 deletions(-) diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 23259e5e550..8a04717c5c0 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,9 +57,8 @@ #include "../pstl/utils_ranges.h" #include "../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/glue_algorithm_ranges_impl.h" +#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. 
-#include "scan_by_segment_impl.h" #endif namespace oneapi @@ -169,42 +168,9 @@ reduce_by_segment_impl(_Tag, Policy&& policy, InputIterator1 first1, InputIterat #if _ONEDPL_BACKEND_SYCL -template -class __seg_reduce_count_kernel; -template -class __seg_reduce_offset_kernel; -template -class __seg_reduce_wg_kernel; -template -class __seg_reduce_prefix_kernel; - -namespace -{ -template -using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; -template -using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; -template -using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; -template -using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; -} // namespace - -template -oneapi::dpl::__internal::__difference_t<_Range3> -__pattern_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) -{ - return oneapi::dpl::experimental::ranges::reduce_by_segment( - ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__keys), ::std::forward<_Range2>(__values), - ::std::forward<_Range3>(__out_keys), ::std::forward<_Range4>(__out_values), __binary_pred, __binary_op); -} - template -::std::pair +std::pair reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& policy, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator1 result1, OutputIterator2 result2, BinaryPred binary_pred, BinaryOperator binary_op) @@ -218,14 +184,14 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol // keys_result = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } -- result1 // values_result = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } -- result2 - using _CountType = ::std::uint64_t; + using _CountType = std::uint64_t; namespace __bknd = __par_backend_hetero; - const auto n = ::std::distance(first1, last1); + const auto n = std::distance(first1, last1); if (n == 0) - return ::std::make_pair(result1, result2); + return std::make_pair(result1, result2); auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); auto key_buf = keep_keys(first1, last1); @@ -236,16 +202,12 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); auto value_output_buf = keep_value_outputs(result2, result2 + n); - using has_known_identity = - typename unseq_backend::__has_known_identity::value_type>::type; - // number of unique keys - _CountType __n = - __pattern_reduce_by_segment(__tag, ::std::forward(policy), key_buf.all_view(), value_buf.all_view(), - key_output_buf.all_view(), value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( + __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), + value_output_buf.all_view(), binary_pred, binary_op); - return ::std::make_pair(result1 + __n, result2 + __n); + return std::make_pair(result1 + __n, result2 + __n); } #endif } // namespace internal diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 07eb9716c26..531b0acbf14 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -831,17 +831,16 
@@ struct __gen_red_by_seg_scan_input const _KeyType& __next_key = __in_keys[__id + 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), - !__binary_pred(__current_key, __next_key), - __next_key, __current_key); + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), + !__binary_pred(__current_key, __next_key), __next_key, __current_key); } else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( - oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, - __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder + oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, + __current_key); // Passing __current_key as the next key for the last element is a placeholder } else { @@ -889,18 +888,19 @@ struct __write_red_by_seg using _KeyType = oneapi::dpl::__internal::__value_t; using _ValType = oneapi::dpl::__internal::__value_t; - const _KeyType& __next_segment_key = get<2>(__tup); - const _ValType& __cur_segment_value = get<1>(get<0>(__tup)); + const _KeyType& __next_key = get<2>(__tup); + const _KeyType& __current_key = get<3>(__tup); + const _ValType& __current_value = get<1>(get<0>(__tup)); const bool __is_seg_end = get<1>(__tup); const std::size_t __out_idx = get<0>(get<0>(__tup)); if (__id == 0) - __out_keys[0] = get<3>(__tup); + __out_keys[0] = __current_key; if (__is_seg_end) { - __out_values[__out_idx] = __cur_segment_value; + __out_values[__out_idx] = __current_value; if (__id != __n - 1) - __out_keys[__out_idx + 1] = __next_segment_key; + __out_keys[__out_idx + 1] = __next_key; } } _BinaryPred __binary_pred; From 4f2432f4c060a6fa729921dc515d8559101efec5 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 09:16:12 -0700 Subject: [PATCH 20/34] Improve comments and mark relevant variables as const Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 531b0acbf14..dc2253ea77f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -791,18 +791,21 @@ struct __gen_transform_input template struct __gen_red_by_seg_reduce_input { + // Returns the following tuple: + // (new_seg_mask, value) + // size_t new_seg_mask : 1 for a start of a new segment, 0 otherwise + // ValueType value : Current element's value for reduction template auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _ValueType = oneapi::dpl::__internal::__value_t; - std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); + const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]); return oneapi::dpl::__internal::make_tuple(__new_seg_mask, 
_ValueType{__in_vals[__id]}); } _BinaryPred __binary_pred; - std::size_t __n; }; template @@ -819,8 +822,8 @@ struct __gen_red_by_seg_scan_input auto operator()(const _InRng& __in_rng, std::size_t __id) const { - auto __in_keys = std::get<0>(__in_rng.tuple()); - auto __in_vals = std::get<1>(__in_rng.tuple()); + const auto __in_keys = std::get<0>(__in_rng.tuple()); + const auto __in_vals = std::get<1>(__in_rng.tuple()); using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; const _KeyType& __current_key = __in_keys[__id]; @@ -829,7 +832,7 @@ struct __gen_red_by_seg_scan_input { const _KeyType& __prev_key = __in_keys[__id - 1]; const _KeyType& __next_key = __in_keys[__id + 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), !__binary_pred(__current_key, __next_key), __next_key, __current_key); @@ -837,7 +840,7 @@ struct __gen_red_by_seg_scan_input else if (__id == __n - 1) { const _KeyType& __prev_key = __in_keys[__id - 1]; - std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); + const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key); return oneapi::dpl::__internal::make_tuple( oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key, __current_key); // Passing __current_key as the next key for the last element is a placeholder @@ -1327,7 +1330,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred, __n}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); From 9864ab1d536de2ca9a0a761eb632dad1d39dd2b5 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:16:18 -0700 Subject: [PATCH 21/34] Add condition to ensure value type is trivially copyable to call reduce-then-scan Signed-off-by: Matthew Michel --- .../hetero/algorithm_ranges_impl_hetero.h | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index 7179e1f864a..fc04e0296ff 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -944,26 +944,28 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& return 1; } + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; #if _ONEDPL_BACKEND_SYCL // We would normally dispatch to the parallel implementation which would make the decision to invoke // reduce-then-scan. 
However, since the fallback is implemented at the ranges level we must choose // whether or not to use reduce-then-scan here. - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + if constexpr (std::is_trivially_copyable_v<__val_type>) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } } #endif - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; - // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. 
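Taken together, patches 18 through 21 reduce the reduce-then-scan dispatch for reduce_by_segment to two checks: the value type must be trivially copyable, and the device must be a GPU that supports a sub-group size of 32. The following is a minimal standalone sketch of that selection logic, not the literal library code; use_reduce_then_scan and its parameter are hypothetical names, and the device query result is passed in as an argument in place of the internal __is_gpu_with_sg_32 call:

    #include <type_traits>

    // Sketch only: mirrors the dispatch condition added in this patch series.
    // The caller supplies the result of the device query (internally,
    // __is_gpu_with_sg_32(__exec)).
    template <typename _ValueType>
    bool
    use_reduce_then_scan(bool __device_is_gpu_with_sg_32)
    {
        // The reduce-then-scan kernels require value types that are safe to
        // copy bytewise, hence the trivially-copyable gate.
        if constexpr (!std::is_trivially_copyable_v<_ValueType>)
            return false;
        else
            return __device_is_gpu_with_sg_32;
    }

For example, a std::uint64_t payload on a qualifying GPU takes the reduce-then-scan path, while a value type with a user-provided copy constructor always selects the fallback implementation.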
From e620a694ff0fba8c56d9a45c21aa7508fe197073 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 22 Oct 2024 12:25:34 -0700 Subject: [PATCH 22/34] clang-format Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index dc2253ea77f..7aff2957ad5 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1313,8 +1313,9 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t template auto -__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, +__parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, + _ExecutionPolicy&& __exec, _Range1&& __keys, _Range2&& __values, + _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; @@ -1330,8 +1331,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), - _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, - _WriteOp{__binary_pred, __n}, + _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, + _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } From cf3a2439f128b3e537895276a945b7b3cec2c5e7 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:52:13 -0600 Subject: [PATCH 23/34] Introduce iterator based __pattern_reduce_by_segment * An iterator based __pattern_reduce_by_segment is added * Due to compiler issues prior to icpx 2025.0, the reduce-then-scan path is disabled and the previous handcrafted SYCL implementation is restored to prevent performance regressions with older compilers * The previous range-based fallback implementation has been moved to the SYCL backend along with the handcrafted SYCL version Signed-off-by: Matthew Michel --- .../dpl/internal/reduce_by_segment_impl.h | 16 +- .../dpl/pstl/hetero/algorithm_impl_hetero.h | 40 ++ .../hetero/algorithm_ranges_impl_hetero.h | 150 +----- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 176 +++++++ .../parallel_backend_sycl_reduce_by_segment.h | 464 ++++++++++++++++++ 5 files changed, 692 insertions(+), 154 deletions(-) create mode 100644 include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h diff --git a/include/oneapi/dpl/internal/reduce_by_segment_impl.h b/include/oneapi/dpl/internal/reduce_by_segment_impl.h index 8a04717c5c0..aa718e3743f 100644 --- a/include/oneapi/dpl/internal/reduce_by_segment_impl.h +++ b/include/oneapi/dpl/internal/reduce_by_segment_impl.h @@ -57,7 +57,7 @@ #include "../pstl/utils_ranges.h" #include 
"../pstl/hetero/dpcpp/utils_ranges_sycl.h" #include "../pstl/ranges_defs.h" -#include "../pstl/hetero/algorithm_ranges_impl_hetero.h" +#include "../pstl/hetero/algorithm_impl_hetero.h" #include "../pstl/hetero/dpcpp/sycl_traits.h" //SYCL traits specialization for some oneDPL types. #endif @@ -193,19 +193,9 @@ reduce_by_segment_impl(__internal::__hetero_tag<_BackendTag> __tag, Policy&& pol if (n == 0) return std::make_pair(result1, result2); - auto keep_keys = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator1>(); - auto key_buf = keep_keys(first1, last1); - auto keep_values = __ranges::__get_sycl_range<__bknd::access_mode::read, InputIterator2>(); - auto value_buf = keep_values(first2, first2 + n); - auto keep_key_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator1>(); - auto key_output_buf = keep_key_outputs(result1, result1 + n); - auto keep_value_outputs = __ranges::__get_sycl_range<__bknd::access_mode::write, OutputIterator2>(); - auto value_output_buf = keep_value_outputs(result2, result2 + n); - // number of unique keys - _CountType __n = oneapi::dpl::__internal::__ranges::__pattern_reduce_by_segment( - __tag, std::forward(policy), key_buf.all_view(), value_buf.all_view(), key_output_buf.all_view(), - value_output_buf.all_view(), binary_pred, binary_op); + _CountType __n = oneapi::dpl::__internal::__pattern_reduce_by_segment( + __tag, std::forward(policy), first1, last1, first2, result1, result2, binary_pred, binary_op); return std::make_pair(result1 + __n, result2 + __n); } diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h index 1a51076c612..d853f057561 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h @@ -2003,6 +2003,46 @@ __pattern_shift_right(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec return __last - __res; } +template +struct __copy_keys_values_wrapper; + +template +typename std::iterator_traits<_Iterator3>::difference_type +__pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Iterator1 __keys_first, + _Iterator1 __keys_last, _Iterator2 __values_first, _Iterator3 __out_keys_first, + _Iterator4 __out_values_first, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + std::size_t __n = std::distance(__keys_first, __keys_last); + + if (__n == 0) + return 0; + + if (__n == 1) + { + __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_op{}; + + oneapi::dpl::__internal::__pattern_walk2_n( + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_wrapper>(__exec), + oneapi::dpl::make_zip_iterator(__keys_first, __values_first), 1, + oneapi::dpl::make_zip_iterator(__out_keys_first, __out_values_first), __copy_op); + + return 1; + } + + auto __keep_keys = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>(); + auto __keys = __keep_keys(__keys_first, __keys_last); + auto __keep_values = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>(); + auto __values = __keep_values(__values_first, __values_first + __n); + auto __keep_key_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>(); + auto __out_keys = __keep_key_outputs(__out_keys_first, __out_keys_first + __n); + auto __keep_value_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>(); + auto 
__out_values = __keep_value_outputs(__out_values_first, __out_values_first + __n); + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __keys.all_view(), __values.all_view(), + __out_keys.all_view(), __out_values.all_view(), __binary_pred, __binary_op); +} + } // namespace __internal } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h index fc04e0296ff..da7820b91a2 100644 --- a/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h +++ b/include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h @@ -889,22 +889,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _ //------------------------------------------------------------------------ template -struct __copy_keys_wrapper; - -template -struct __copy_values_wrapper; - -template -struct __reduce1_wrapper; - -template -struct __reduce2_wrapper; - -template -struct __assign_key1_wrapper; - -template -struct __assign_key2_wrapper; +struct __copy_keys_values_range_wrapper; template @@ -932,135 +917,18 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_range{}; oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range, - std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys)); - - oneapi::dpl::__internal::__ranges::__pattern_walk_n( - __tag, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - __copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values)); + __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_range_wrapper>(__exec), + __copy_range, + oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values))); return 1; } - using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; - using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; - using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; -#if _ONEDPL_BACKEND_SYCL - // We would normally dispatch to the parallel implementation which would make the decision to invoke - // reduce-then-scan. However, since the fallback is implemented at the ranges level we must choose - // whether or not to use reduce-then-scan here. - if constexpr (std::is_trivially_copyable_v<__val_type>) - { - if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) - { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), - std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), - __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. 
- return std::get<0>(__res.get()) + 1; - } - } -#endif - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__tmp_out_keys), - experimental::ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. - std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. 
Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), __k3, - __clipped_new_keys); - auto __view4 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__out_keys), - experimental::ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. - auto __result_end = __ranges::__pattern_copy_if( - __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4, - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - _BackendTag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end), - experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - - return __result_end; + return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment( + _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), + std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), + __binary_pred, __binary_op); } } // namespace __ranges diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 7aff2957ad5..d25907446d8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -32,12 +32,14 @@ #include "../../iterator_impl.h" #include "../../execution_impl.h" #include "../../utils_ranges.h" +#include "../../ranges_defs.h" #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" #include "parallel_backend_sycl_reduce.h" #include "parallel_backend_sycl_merge.h" #include "parallel_backend_sycl_merge_sort.h" +#include "parallel_backend_sycl_reduce_by_segment.h" #include "parallel_backend_sycl_reduce_then_scan.h" #include "execution_sycl_defs.h" #include "sycl_iterator.h" @@ -2275,6 +2277,180 @@ __parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_ return __parallel_partial_sort_impl(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec), 
__buf.all_view(), __partial_merge_kernel{__mid_idx}, __comp); } + +//------------------------------------------------------------------------ +// reduce_by_segment - sync pattern +//------------------------------------------------------------------------ + +// TODO: The non-identity fallback path of reduce-by-segment must currently be implemented synchronously due to the +// inability to create event dependency chains across separate parallel pattern calls. If we ever add support for +// cross parallel pattern dependencies, then we can implement this as an async pattern. +template +struct __reduce1_wrapper; + +template +struct __reduce2_wrapper; + +template +struct __assign_key1_wrapper; + +template +struct __assign_key2_wrapper; + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + // The algorithm reduces values in __values where the + // associated keys for the values are equal to the adjacent key. + // + // Example: __keys = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // __values = { 1, 2, 3, 4, 1, 1, 3, 3, 1, 1, 3, 3, 0 } + // + // __out_keys = { 1, 2, 3, 4, 1, 3, 1, 3, 0 } + // __out_values = { 1, 2, 3, 4, 2, 6, 2, 6, 0 } + + const auto __n = __keys.size(); + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. +#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000 + if constexpr (std::is_trivially_copyable_v<__val_type>) + { + if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } + } +#endif + if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) + { + return __parallel_reduce_by_segment_known_identity( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + } + else + { + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. 
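+        // Why the artificial segment breaks are safe: Round 1 may split one
+        // logical segment into several work-group-sized pieces, but each piece
+        // keeps its original key, and Round 2 re-reduces adjacent partial
+        // results whose keys compare equal, so the final output matches a
+        // single-pass segmented reduction.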
+ auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // Use the work-group size, adjusted to the shared local memory limit, as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is at an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments). + // TODO: replace wgroup size with segment size based on platform specifics. + auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + // reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. 
+ auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. + auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h new file mode 100644 index 00000000000..14860e3830a --- /dev/null +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -0,0 +1,464 @@ +// -*- C++ -*- +//===-- parallel_backend_sycl_reduce_by_segment.h ---------------------------------===// +/* Copyright (c) Intel Corporation + * + *  Copyright 2008-2013 NVIDIA Corporation + * + *  Licensed under the Apache License, Version 2.0 (the "License"); + *  you may not use this file except in compliance with the License. + *  You may obtain a copy of the License at + * + *      http://www.apache.org/licenses/LICENSE-2.0 + * + *  Unless required by applicable law or agreed to in writing, software + *  distributed under the License is distributed on an "AS IS" BASIS, + *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + *  See the License for the specific language governing permissions and + *  limitations under the License. + * + *  Copyright (c) 2013, NVIDIA CORPORATION.  All rights reserved. 
+ *  + *  Redistribution and use in source and binary forms, with or without + *  modification, are permitted provided that the following conditions are met: + *     * Redistributions of source code must retain the above copyright + *       notice, this list of conditions and the following disclaimer. + *     * Redistributions in binary form must reproduce the above copyright + *       notice, this list of conditions and the following disclaimer in the + *       documentation and/or other materials provided with the distribution. + *     * Neither the name of the NVIDIA CORPORATION nor the + *       names of its contributors may be used to endorse or promote products + *       derived from this software without specific prior written permission. + *  + *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"  + *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE  + *  ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + *  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + *  (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H +#define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_BY_SEGMENT_H + +#include +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "utils_ranges_sycl.h" +#include "sycl_traits.h" + +#include "../../utils.h" +#include "../../../internal/scan_by_segment_impl.h" + +namespace oneapi +{ +namespace dpl +{ +namespace __par_backend_hetero +{ + +template +class __seg_reduce_count_kernel; +template +class __seg_reduce_offset_kernel; +template +class __seg_reduce_wg_kernel; +template +class __seg_reduce_prefix_kernel; + +namespace +{ +template +using _SegReduceCountPhase = __seg_reduce_count_kernel<_Name...>; +template +using _SegReduceOffsetPhase = __seg_reduce_offset_kernel<_Name...>; +template +using _SegReduceWgPhase = __seg_reduce_wg_kernel<_Name...>; +template +using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; +} // namespace + +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + + using _SegReduceCountKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceCountPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceOffsetKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceOffsetPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReduceWgKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReduceWgPhase, _CustomName, 
_ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + using _SegReducePrefixKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + _SegReducePrefixPhase, _CustomName, _ExecutionPolicy, _Range1, _Range2, _Range3, _Range4, _BinaryPredicate, + _BinaryOperator>; + + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range3>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const std::size_t __n = __keys.size(); + + constexpr std::uint16_t __vals_per_item = + 16; // Each work item serially processes 16 items. Best observed performance on gpu + + // Limit the work-group size to prevent large sizes on CPUs. Empirically found value. + // This value exceeds the current practical limit for GPUs, but may need to be re-evaluated in the future. + std::size_t __wgroup_size = oneapi::dpl::__internal::__max_work_group_size(__exec, (std::size_t)2048); + + // adjust __wgroup_size according to local memory limit. Double the requirement on __val_type due to sycl group algorithm's use + // of SLM. + __wgroup_size = oneapi::dpl::__internal::__slm_adjusted_work_group_size( + __exec, sizeof(__key_type) + 2 * sizeof(__val_type), __wgroup_size); + +#if _ONEDPL_COMPILE_KERNEL + auto __seg_reduce_count_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceCountKernel>::__compile(__exec); + auto __seg_reduce_offset_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceOffsetKernel>::__compile(__exec); + auto __seg_reduce_wg_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReduceWgKernel>::__compile(__exec); + auto __seg_reduce_prefix_kernel = + __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); + __wgroup_size = + std::min({__wgroup_size, + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); +#endif + + std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item); + + // intermediate reductions within a workgroup + auto __partials = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n_groups).get_buffer(); + + auto __end_idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, 1).get_buffer(); + + // the number of segment ends found in each work group + auto __seg_ends = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // buffer that stores an exclusive scan of the results + auto __seg_ends_scanned = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n_groups).get_buffer(); + + // 1. 
Count the segment ends in each workgroup + auto __seg_end_identification = __exec.queue().submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_count_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceCountKernel>( + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=]( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_count_kernel, +#endif + sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + // 1a. Work item scan to identify segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + // 1b. Work group reduction + std::size_t __num_segs = __dpl_sycl::__reduce_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 1c. First work item writes segment count to global memory + if (__local_id == 0) + __seg_ends_acc[__group_id] = __num_segs; + }); + }); + + // 1.5 Small single-group kernel + auto __single_group_scan = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__seg_end_identification); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_offset_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceOffsetKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_offset_kernel, +#endif + sycl::nd_range<1>{__wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_acc); + auto __out_beg = __dpl_sycl::__get_accessor_ptr(__seg_ends_scan_acc); + __dpl_sycl::__joint_exclusive_scan(__item.get_group(), __beg, __beg + __n_groups, __out_beg, + __diff_type(0), sycl::plus<__diff_type>()); + }); + }); + + // 2. Work group reduction + auto __wg_reduce = __exec.queue().submit([&](sycl::handler& __cgh) { + __cgh.depends_on(__single_group_scan); + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values, __values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + __dpl_sycl::__local_accessor<__val_type> __loc_acc(2 * __wgroup_size, __cgh); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_wg_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReduceWgKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_wg_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + std::array<__val_type, __vals_per_item> __loc_partials; + + auto __group = __item.get_group(); + std::size_t __group_id = __item.get_group(0); + std::size_t __local_id = __item.get_local_id(0); + std::size_t __global_id = __item.get_global_id(0); + + // 2a. 
Lookup the number of prior segs + auto __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 2b. Perform a serial scan within the work item over assigned elements. Store partial + // reductions in work group local memory. + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + + std::size_t __max_end = 0; + std::size_t __item_segments = 0; + auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + __val_type __accumulator = __identity; + for (std::size_t __i = __start; __i < __end; ++__i) + { + __accumulator = __binary_op(__accumulator, __values[__i]); + if (__n - 1 == __i || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + __loc_partials[__i - __start] = __accumulator; + ++__item_segments; + __max_end = __local_id; + __accumulator = __identity; + } + } + + // 2c. Count the number of prior work segments cooperatively over group + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; + + // 2d. Find the greatest segment end less than the current index (inclusive) + std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( + __group, __max_end, __dpl_sycl::__maximum()); + + // __wg_segmented_scan is a derivative work and responsible for the third header copyright + __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( + __item, __loc_acc, __local_id, __local_id - __closest_seg_id, __accumulator, __identity, + __binary_op, __wgroup_size); + + // 2e. Update local partial reductions in first segment and write to global memory. + bool __apply_aggs = true; + std::size_t __item_offset = 0; + + // first item in group does not have any work-group aggregates to apply + if (__local_id == 0) + { + __apply_aggs = false; + if (__global_id == 0 && __n > 0) + { + // first segment identifier is always the first key + __out_keys[0] = __keys[0]; + } + } + + // apply the aggregates and copy the locally stored values to destination buffer + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __start_idx + __item_offset; + if (__apply_aggs) + { + __out_values[__idx] = __binary_op(__carry_in, __loc_partials[__i - __start]); + __apply_aggs = false; + } + else + { + __out_values[__idx] = __loc_partials[__i - __start]; + } + if (__i != __n - 1) + { + __out_keys[__idx + 1] = __keys[__i + 1]; + } + ++__item_offset; + } + } + + // 2f. Output the work group aggregate and total number of segments for use in phase 3. + if (__local_id == __wgroup_size - 1) // last work item writes the group's carry out + { + // If no segment ends in the item, the aggregates from previous work groups must be applied. + if (__max_end == 0) + { + // needs to be inclusive with last element + __partials_acc[__group_id] = __binary_op(__carry_in, __accumulator); + } + else + { + __partials_acc[__group_id] = __accumulator; + } + } + }); + }); + + // 3. 
Apply inter work-group aggregates + __exec.queue() + .submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __keys, __out_keys, __out_values); + + auto __partials_acc = __partials.template get_access(__cgh); + auto __seg_ends_scan_acc = __seg_ends_scanned.template get_access(__cgh); + auto __seg_ends_acc = __seg_ends.template get_access(__cgh); + auto __end_idx_acc = __end_idx.template get_access(__cgh); + + __dpl_sycl::__local_accessor<__val_type> __loc_partials_acc(__wgroup_size, __cgh); + __dpl_sycl::__local_accessor<__diff_type> __loc_seg_ends_acc(__wgroup_size, __cgh); + + __cgh.depends_on(__wg_reduce); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT + __cgh.use_kernel_bundle(__seg_reduce_prefix_kernel.get_kernel_bundle()); +#endif + __cgh.parallel_for<_SegReducePrefixKernel>( +#if _ONEDPL_COMPILE_KERNEL && !_ONEDPL_KERNEL_BUNDLE_PRESENT + __seg_reduce_prefix_kernel, +#endif + sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) { + auto __group = __item.get_group(); + std::int64_t __group_id = __item.get_group(0); + std::size_t __global_id = __item.get_global_id(0); + std::size_t __local_id = __item.get_local_id(0); + + std::size_t __start = __global_id * __vals_per_item; + std::size_t __end = __dpl_sycl::__minimum{}(__start + __vals_per_item, __n); + std::size_t __item_segments = 0; + + std::int64_t __wg_agg_idx = __group_id - 1; + __val_type __agg_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + + bool __ag_exists = false; + // 3a. Check to see if an aggregate exists and compute that value in the first + // work item. + if (__group_id != 0) + { + __ag_exists = __start < __n; + // local reductions followed by a sweep + constexpr std::int32_t __vals_to_explore = 16; + bool __last_it = false; + __loc_seg_ends_acc[__local_id] = false; + __loc_partials_acc[__local_id] = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + for (std::int32_t __i = __wg_agg_idx - __vals_to_explore * __local_id; !__last_it; + __i -= __wgroup_size * __vals_to_explore) + { + __val_type __local_collector = unseq_backend::__known_identity<_BinaryOperator, __val_type>; + // exploration phase + for (std::int32_t __j = __i; + __j > __dpl_sycl::__maximum{}(-1L, __i - __vals_to_explore); --__j) + { + __local_collector = __binary_op(__partials_acc[__j], __local_collector); + if (__seg_ends_acc[__j] || __j == 0) + { + __loc_seg_ends_acc[__local_id] = true; + break; + } + } + __loc_partials_acc[__local_id] = __local_collector; + __dpl_sycl::__group_barrier(__item); + // serial aggregate collection and synchronization + if (__local_id == 0) + { + for (std::size_t __j = 0; __j < __wgroup_size; ++__j) + { + __agg_collector = __binary_op(__loc_partials_acc[__j], __agg_collector); + if (__loc_seg_ends_acc[__j]) + { + __last_it = true; + break; + } + } + } + __agg_collector = __dpl_sycl::__group_broadcast(__item.get_group(), __agg_collector); + __last_it = __dpl_sycl::__group_broadcast(__item.get_group(), __last_it); + } + + // Check to see if aggregates exist. + // The last group must always stay to write the final index + __ag_exists = __dpl_sycl::__any_of_group(__group, __ag_exists); + if (!__ag_exists && __group_id != __n_groups - 1) + return; + } + // 3b. 
count the segment ends + for (std::size_t __i = __start; __i < __end; ++__i) + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + ++__item_segments; + + std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( + __group, __item_segments, __dpl_sycl::__plus()); + + // 3c. Determine prior index + std::size_t __wg_num_prior_segs = __seg_ends_scan_acc[__group_id]; + + // 3d. Second pass over the keys, reidentifying end segments and applying work group + // aggregates if appropriate. Both the key and reduction value are written to the final output at the + // computed index + std::size_t __item_offset = 0; + for (std::size_t __i = __start; __i < __end; ++__i) + { + if (__i == __n - 1 || !__binary_pred(__keys[__i], __keys[__i + 1])) + { + std::size_t __idx = __wg_num_prior_segs + __prior_segs_in_wg + __item_offset; + + // apply the aggregate if it is the first segment end in the workgroup only + if (__prior_segs_in_wg == 0 && __item_offset == 0 && __ag_exists) + __out_values[__idx] = __binary_op(__agg_collector, __out_values[__idx]); + + ++__item_offset; + // the last item must write the last index's position to return + if (__i == __n - 1) + __end_idx_acc[0] = __idx; + } + } + }); + }) + .wait(); + + return __end_idx.get_host_access()[0] + 1; +} + +} // namespace __par_backend_hetero +} // namespace dpl +} // namespace oneapi + +#endif + From bea3d38e6f9ada4d70b296f01edbb83159b483e5 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 5 Nov 2024 13:57:48 -0600 Subject: [PATCH 24/34] Revert "Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro" This reverts commit a4c783533fcd8675094f77681d6d00dc3d23c0f7. --- CMakeLists.txt | 10 ++++++++++ cmake/README.md | 1 + .../pstl/hetero/dpcpp/unseq_backend_sycl.h | 17 +++++++++++++--- test/CMakeLists.txt | 5 +++++ .../numeric.ops/reduce_by_segment.pass.cpp | 20 ++++++++++++++++--- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8686e3eb47..cbe9a214a1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -293,6 +293,16 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$") endif() endif() + if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION) + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1") + else() + message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option") + target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0") + endif() + endif() + # DPC++ specific macro target_compile_definitions(oneDPL INTERFACE $<$,$>:ONEDPL_FPGA_DEVICE> diff --git a/cmake/README.md b/cmake/README.md index 0683a377820..7335b7e2312 100644 --- a/cmake/README.md +++ b/cmake/README.md @@ -18,6 +18,7 @@ The following variables are provided for oneDPL configuration: | ONEDPL_AOT_ARCH | STRING | Architecture options for ahead-of-time compilation, supported values can be found [here](https://software.intel.com/content/www/us/en/develop/documentation/oneapi-dpcpp-cpp-compiler-dev-guide-and-reference/top/compilation/ahead-of-time-compilation.html) | "*" for GPU device and "avx" for CPU device | | ONEDPL_TEST_EXPLICIT_KERNEL_NAMES | STRING | Control kernel naming. Affects only oneDPL test targets. Supported values: AUTO, ALWAYS. 
AUTO: rely on the compiler if "Unnamed SYCL lambda kernels" feature is on, otherwise provide kernel names explicitly; ALWAYS: provide kernel names explicitly | AUTO | | ONEDPL_TEST_WIN_ICX_FIXES | BOOL | Affects only oneDPL test targets. Enable icx, icx-cl workarounds to fix issues in CMake for Windows. | ON | +| ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION | BOOL | Use as a workaround for incorrect results, which may be produced by reduction algorithms with 64-bit data types compiled by the Intel® oneAPI DPC++/C++ Compiler and executed on GPU devices. | | Some useful CMake variables ([here](https://cmake.org/cmake/help/latest/manual/cmake-variables.7.html) you can find a full list of CMake variables for the latest version): diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h index 9bf4325f221..2caa6add318 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h @@ -34,11 +34,21 @@ namespace unseq_backend //This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace. //We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard. +template +inline constexpr bool __can_use_known_identity = +# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION + // When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types + !(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t)); +# else + true; +# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION + //TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to //std::multiplies, std::bit_or, std::bit_and and std::bit_xor operations will be fixed. 
//std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group template -using __has_known_identity = +using __has_known_identity = ::std::conditional_t< + __can_use_known_identity<_Tp>, # if _ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::disjunction< __dpl_sycl::__has_known_identity<_BinaryOp, _Tp>, @@ -50,15 +60,16 @@ using __has_known_identity = ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum>>>>, # else //_ONEDPL_LIBSYCL_VERSION >= 50200 typename ::std::conjunction< ::std::is_arithmetic<_Tp>, ::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>, ::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus>, ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>, - ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>; + ::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus>>>, # endif //_ONEDPL_LIBSYCL_VERSION >= 50200 + ::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false #else //_ONEDPL_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e85e8e9f5f8..90eb3d5c737 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -195,6 +195,7 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) string(REPLACE "\.cpp" "" _test_name ${_test_name}) set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass") + set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass") # mark those tests with pstloffload_smoke_tests label set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass" "transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass" @@ -208,6 +209,10 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators) if (_test_name IN_LIST coal_tests) onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}") + elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") + string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name}) + onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}") elseif(_test_name STREQUAL "free_after_unload.pass") onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}") onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}") diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp index 2cee63239b6..f71f78ed26e 100644 --- 
a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -13,6 +13,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
+#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
+# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+#endif
+
 #include "support/test_config.h"
 
 #include "oneapi/dpl/execution"
@@ -298,10 +306,16 @@
 void
 run_test_on_device()
 {
 #if TEST_DPCPP_BACKEND_PRESENT
-    if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+    // Skip testing of 64-bit types when the algorithm is broken and the workaround is not enabled
+#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
+    if constexpr (sizeof(ValueType) != 8)
+#endif
     {
-        constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
-        test4buffers>();
+        if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
+        {
+            constexpr sycl::usm::alloc allocation_type = use_device_alloc ? sycl::usm::alloc::device : sycl::usm::alloc::shared;
+            test4buffers>();
+        }
     }
 #endif // TEST_DPCPP_BACKEND_PRESENT
 }

From 70723ea5af364aaed4dd91bc0558bf3d840c5180 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 5 Nov 2024 14:00:41 -0600
Subject: [PATCH 25/34] Fix test bug where device allocation is always used for
 testing

Signed-off-by: Matthew Michel
---
 .../parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
index f71f78ed26e..80aa9f53d3c 100644
--- a/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
+++ b/test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
@@ -314,7 +314,7 @@ run_test_on_device()
     if (TestUtils::has_type_support(TestUtils::get_test_queue().get_device()))
     {
         constexpr sycl::usm::alloc allocation_type = use_device_alloc ?
sycl::usm::alloc::device : sycl::usm::alloc::shared; - test4buffers>(); + test4buffers>(); } } #endif // TEST_DPCPP_BACKEND_PRESENT From 76deb6f550b6e3adacca153752b5f44d0c926bae Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 13:45:21 -0600 Subject: [PATCH 26/34] Separate each reduce_by_segment fallback path into their own functions Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 239 +++++++++--------- .../parallel_backend_sycl_reduce_by_segment.h | 7 +- 2 files changed, 129 insertions(+), 117 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d25907446d8..eadc79125be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2297,6 +2297,126 @@ struct __assign_key1_wrapper; template struct __assign_key2_wrapper; +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, + /*known_identity=*/std::false_type) +{ + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const auto __n = __keys.size(); + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. + auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. 
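An illustrative aside, not part of the patch: the predicate handed to __parallel_copy_if just below implements the round-1 segment-start rule spelled out in the comment above. A minimal host-side sketch of that rule, assuming plain std::vector storage and a hypothetical helper name (this is not oneDPL code):

    #include <cstddef>
    #include <vector>

    // Keep index 0, indices on a work-group-size boundary (this caps how long any
    // one segment can get), and indices where a new run of keys begins.
    template <typename Key, typename BinaryPred>
    std::vector<std::size_t>
    round1_segment_starts(const std::vector<Key>& keys, BinaryPred pred, std::size_t wg_size)
    {
        std::vector<std::size_t> starts;
        for (std::size_t i = 0; i < keys.size(); ++i)
            if (i == 0 || i % wg_size == 0 || !pred(keys[i - 1], keys[i]))
                starts.push_back(i);
        return starts;
    }

    // e.g. keys {7, 7, 7, 7, 3} with wg_size == 2 give starts {0, 2, 4}: the run
    // of 7s is split artificially so no single serial reduction grows too long.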
+ auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. + auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
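A companion sketch for the second round, again illustrative and with a hypothetical name rather than oneDPL code: round 2 drops the work-group-size criterion, so only true segment boundaries survive and the artificial splits from round 1 merge back together.

    #include <cstddef>
    #include <vector>

    // Keep index 0 and indices where a new run of keys begins; nothing else.
    template <typename Key, typename BinaryPred>
    std::vector<std::size_t>
    round2_segment_starts(const std::vector<Key>& keys, BinaryPred pred)
    {
        std::vector<std::size_t> starts;
        for (std::size_t i = 0; i < keys.size(); ++i)
            if (i == 0 || !pred(keys[i - 1], keys[i]))
                starts.push_back(i);
        return starts;
    }

    // Continuing the round-1 example: the representative keys {7, 7, 3} collapse
    // to starts {0, 2}, one output segment per run of equal keys. The per-split
    // partial results from round 1 are then combined per final segment (see the
    // __brick_reduce_idx call below).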
+ auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; +} + template oneapi::dpl::__internal::__difference_t<_Range3> @@ -2335,120 +2455,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe } } #endif - if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) - { - return __parallel_reduce_by_segment_known_identity( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - } - else - { - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. 
- std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - __n, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), - __k3, __clipped_new_keys); - auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. 
- auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), - __view3, __view4, __view3.size(), - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), - oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - return __result_end; - } + return __parallel_reduce_by_segment_fallback( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op, + oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{}); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index 14860e3830a..f3455d45779 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -89,9 +89,10 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, + /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; From b5b266db0d0fad373bf99d0254bd7ea67cb3add1 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 14:50:25 -0600 Subject: [PATCH 27/34] clang-format Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 ++---- .../parallel_backend_sycl_reduce_by_segment.h | 21 +++++++++---------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index eadc79125be..afd14bd3ea9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ 
b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2314,8 +2314,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // Round 1: reduce with extra indices added to avoid long segments // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); auto __tmp_out_keys = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); auto __tmp_out_values = @@ -2361,8 +2360,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ oneapi::dpl::__par_backend_hetero::__parallel_for( oneapi::dpl::__internal::__device_backend_tag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end, oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __intermediate_result_end), std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index f3455d45779..bc702137963 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -86,12 +86,13 @@ template using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; } // namespace -template +template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; @@ -137,11 +138,10 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ auto __seg_reduce_prefix_kernel = __par_backend_hetero::__internal::__kernel_compiler<_SegReducePrefixKernel>::__compile(__exec); __wgroup_size = - std::min({__wgroup_size, - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_offset_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel), - oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)}); + std::min({__wgroup_size, oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_count_kernel), + oneapi::dpl::__internal::__kernel_work_group_size(__exec, 
__seg_reduce_offset_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_wg_kernel),
+                  oneapi::dpl::__internal::__kernel_work_group_size(__exec, __seg_reduce_prefix_kernel)});
 #endif
 
     std::size_t __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n, __wgroup_size * __vals_per_item);
@@ -462,4 +462,3 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_
 } // namespace oneapi
 
 #endif
-

From 137416ecbfbeea83453b4dfc5fe78ca668aa9c55 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Thu, 21 Nov 2024 16:40:45 -0600
Subject: [PATCH 28/34] Address comments in reduce-then-scan based
 implementation

Signed-off-by: Matthew Michel
---
 .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h    | 12 ++++++++++--
 .../implementation_details/device_copyable.pass.cpp  |  6 +++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index afd14bd3ea9..87e3773641d 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -804,6 +804,9 @@ struct __gen_red_by_seg_reduce_input
         const auto __in_keys = std::get<0>(__in_rng.tuple());
         const auto __in_vals = std::get<1>(__in_rng.tuple());
         using _ValueType = oneapi::dpl::__internal::__value_t;
+        // The first segment start (index 0) is not marked with a 1. This is because we need the first
+        // segment's key and value output index to be 0. We begin marking new segments only after the
+        // first.
         const std::size_t __new_seg_mask = __id > 0 && !__binary_pred(__in_keys[__id - 1], __in_keys[__id]);
         return oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]});
     }
@@ -847,7 +850,7 @@ struct __gen_red_by_seg_scan_input
                 oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key,
                 __current_key); // Passing __current_key as the next key for the last element is a placeholder
         }
-        else
+        else // __id == 0
         {
             const _KeyType& __next_key = __in_keys[__id + 1];
             return oneapi::dpl::__internal::make_tuple(
@@ -869,7 +872,7 @@ struct __red_by_seg_op
     {
         using std::get;
         // The left-hand side has processed elements from the same segment, so update the reduction value.
-        if (std::get<0>(__rhs_tup) == 0)
+        if (get<0>(__rhs_tup) == 0)
         {
             return oneapi::dpl::__internal::make_tuple(get<0>(__lhs_tup),
                                                        __binary_op(get<1>(__lhs_tup), get<1>(__rhs_tup)));
@@ -899,6 +902,11 @@ struct __write_red_by_seg
         const bool __is_seg_end = get<1>(__tup);
         const std::size_t __out_idx = get<0>(get<0>(__tup));
 
+        // With the exception of the first key, which is output at index 0, the first key in each segment is written
+        // by the work item that outputs the previous segment's reduction value. This is because the reduce_by_segment
+        // API requires that the first key in a segment is output, which is important when keys in a segment might not
+        // be identical (but still satisfy the predicate). The last segment does not output a key as there are no
+        // future segments to process.
if (__id == 0) __out_keys[0] = __current_key; if (__is_seg_end) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 97707a50119..8bcdc9e687c 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -158,9 +158,9 @@ test_device_copyable() "__gen_transform_input is not device copyable with device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input static_assert( From 7f9bacdd9955f0d6cb3621c5d1beba60f5ab7d04 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:36:52 -0600 Subject: [PATCH 29/34] Improve explanations of reduce-by-segment approach Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 87e3773641d..2e1549abdd9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -866,6 +866,51 @@ struct __gen_red_by_seg_scan_input template struct __red_by_seg_op { + // Consider the following segment / value pairs that would be processed in reduce-then-scan by a sub-group of size 8: + // ---------------------------------------------------------- + // Keys: 0 0 1 1 2 2 2 2 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The reduce and scan input generation phase flags new segments (excluding index 0) for use in the sub-group scan + // operation. The above key, value pairs correspond to the following flag, value pairs: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // The sub-group scan operation looks back by powers-of-2 applying encountered prefixes. The __red_by_seg_op + // operation performs a standard inclusive scan over the flags to compute output indices while performing a masked + // scan over values to avoid applying a previous segment's partial reduction. 
Previous value elements are reduced + // so long as the current index's flag is 0, indicating that input within its segment is still being processed + // ---------------------------------------------------------- + // Start: + // ---------------------------------------------------------- + // Flags: 0 0 1 0 1 0 0 0 + // Values: 1 1 1 1 1 1 1 1 + // ---------------------------------------------------------- + // After step 1 (apply the i-1th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 1 1 0 0 + // Values: 1 2 1 2 1 2 2 2 + // ---------------------------------------------------------- + // After step 2 (apply the i-2th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 1 1 + // Values: 1 2 1 2 1 2 3 4 + // ---------------------------------------------------------- + // After step 3 (apply the i-4th value if the ith flag is 0): + // ---------------------------------------------------------- + // Flags: 0 0 1 1 2 2 2 2 + // Values: 1 2 1 2 1 2 3 4 + // ^ ^ ^ + // ---------------------------------------------------------- + // Note that the scan of segment flags results in the desired output index of the reduce_by_segment operation in + // each segment and the item corresponding to the final key in a segment contains its output reduction value. This + // operation is first applied within a sub-group and then across sub-groups, work-groups, and blocks to + // reduce-by-segment across the full input. The result of these operations combined with cached key data in + // __gen_red_by_seg_scan_input enables the write phase to output keys and reduction values. + // => + // Segments : 0 1 2 + // Values : 2 2 4 template auto operator()(const _Tup1& __lhs_tup, const _Tup2& __rhs_tup) const @@ -1328,10 +1373,15 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { + // Flags new segments and passes input value through a 2-tuple using _GenReduceInput = __gen_red_by_seg_reduce_input<_BinaryPredicate>; + // Operation that computes output indices and output reduction values per segment using _ReduceOp = __red_by_seg_op<_BinaryOperator>; + // Returns 4-component tuple which contains flags, keys, value, and a flag to write output using _GenScanInput = __gen_red_by_seg_scan_input<_BinaryPredicate>; + // Returns the first component from scan input which is scanned over using _ScanInputTransform = __get_zeroth_element; + // Writes current segment's output reduction and the next segment's output key using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; std::size_t __n = __keys.size(); From bfdc57e4a584d0bd34dd7478abc8e75b870cbecd Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 15:54:08 -0600 Subject: [PATCH 30/34] Use binary_op[_non]_device_copyable where appropriate Signed-off-by: Matthew Michel --- .../device_copyable.pass.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/general/implementation_details/device_copyable.pass.cpp b/test/general/implementation_details/device_copyable.pass.cpp index 8bcdc9e687c..322d93d4824 100644 --- a/test/general/implementation_details/device_copyable.pass.cpp +++ b/test/general/implementation_details/device_copyable.pass.cpp @@ -163,9 +163,9 @@ test_device_copyable() 
"__gen_red_by_seg_reduce_input is not device copyable with device copyable types"); //__gen_red_by_seg_scan_input - static_assert( - sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); + static_assert(sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is not device copyable with device copyable types"); //__gen_mask static_assert(sycl::is_device_copyable_v>, @@ -198,7 +198,7 @@ test_device_copyable() //__write_red_by_seg static_assert( - sycl::is_device_copyable_v>, + sycl::is_device_copyable_v>, "__write_red_by_seg is not device copyable with device copyable types"); // __early_exit_find_or @@ -421,14 +421,14 @@ test_non_device_copyable() "__gen_transform_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_reduce_input>, + "__gen_red_by_seg_reduce_input is device copyable with non device copyable types"); //__gen_red_by_seg_reduce_input - static_assert( - !sycl::is_device_copyable_v>, - "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__gen_red_by_seg_scan_input>, + "__gen_red_by_seg_scan_input is device copyable with non device copyable types"); //__gen_mask static_assert(!sycl::is_device_copyable_v>, @@ -460,9 +460,9 @@ test_non_device_copyable() "__write_to_id_if_else is device copyable with non device copyable types"); //__write_red_by_seg - static_assert( - !sycl::is_device_copyable_v>, - "__write_red_by_seg is device copyable with non device copyable types"); + static_assert(!sycl::is_device_copyable_v< + oneapi::dpl::__par_backend_hetero::__write_red_by_seg>, + "__write_red_by_seg is device copyable with non device copyable types"); // __early_exit_find_or static_assert( From eb5889904c0f5a4a377edc5921b7c81a01bb2413 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 16:57:30 -0600 Subject: [PATCH 31/34] Address comments in fallback implementation Signed-off-by: Matthew Michel --- .../dpcpp/parallel_backend_sycl_reduce_by_segment.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index bc702137963..62ae736782d 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -175,7 +175,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ sycl::nd_item<1> __item) { auto __group = __item.get_group(); std::size_t __group_id = __item.get_group(0); - std::size_t __local_id = __item.get_local_id(0); + std::uint32_t __local_id = __item.get_local_id(0); std::size_t __global_id = __item.get_global_id(0); std::size_t __start = __global_id * __vals_per_item; @@ -267,12 +267,12 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ // 2c. 
Count the number of prior work segments cooperatively over group std::size_t __prior_segs_in_wg = __dpl_sycl::__exclusive_scan_over_group( - __group, __item_segments, __dpl_sycl::__plus()); + __group, __item_segments, __dpl_sycl::__plus()); std::size_t __start_idx = __wg_num_prior_segs + __prior_segs_in_wg; // 2d. Find the greatest segment end less than the current index (inclusive) std::size_t __closest_seg_id = __dpl_sycl::__inclusive_scan_over_group( - __group, __max_end, __dpl_sycl::__maximum()); + __group, __max_end, __dpl_sycl::__maximum()); // __wg_segmented_scan is a derivative work and responsible for the third header copyright __val_type __carry_in = oneapi::dpl::internal::__wg_segmented_scan( @@ -287,7 +287,7 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ if (__local_id == 0) { __apply_aggs = false; - if (__global_id == 0 && __n > 0) + if (__global_id == 0) { // first segment identifier is always the first key __out_keys[0] = __keys[0]; From 45b6975616a9e2d82dade638c8cc4ca70b7f718a Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:27:59 -0600 Subject: [PATCH 32/34] Adjust comment to improve clarity Co-authored-by: Adam Fidel <110841220+adamfidel@users.noreply.github.com> --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 2e1549abdd9..f204054a1bc 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2338,11 +2338,11 @@ __parallel_partial_sort(oneapi::dpl::__internal::__device_backend_tag __backend_ //------------------------------------------------------------------------ // reduce_by_segment - sync pattern -//------------------------------------------------------------------------ - +// // TODO: The non-identity fallback path of reduce-by-segment must currently be implemented synchronously due to the // inability to create event dependency chains across separate parallel pattern calls. If we ever add support for // cross parallel pattern dependencies, then we can implement this as an async pattern. +//------------------------------------------------------------------------ template struct __reduce1_wrapper; From aa1ac312abc48584f2c525f7c476982234f0e832 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 6 Dec 2024 12:07:40 -0800 Subject: [PATCH 33/34] Address review comment Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index f204054a1bc..96d63e33aee 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -832,6 +832,7 @@ struct __gen_red_by_seg_scan_input using _KeyType = oneapi::dpl::__internal::__value_t; using _ValueType = oneapi::dpl::__internal::__value_t; const _KeyType& __current_key = __in_keys[__id]; + const _ValueType& __current_val = __in_vals[__id]; // Ordering the most common condition first has yielded the best results. 
From aa1ac312abc48584f2c525f7c476982234f0e832 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Fri, 6 Dec 2024 12:07:40 -0800
Subject: [PATCH 33/34] Address review comment

Signed-off-by: Matthew Michel
---
 .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index f204054a1bc..96d63e33aee 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -832,6 +832,7 @@ struct __gen_red_by_seg_scan_input
         using _KeyType = oneapi::dpl::__internal::__value_t;
         using _ValueType = oneapi::dpl::__internal::__value_t;
         const _KeyType& __current_key = __in_keys[__id];
+        const _ValueType& __current_val = __in_vals[__id];
         // Ordering the most common condition first has yielded the best results.
         if (__id > 0 && __id < __n - 1)
         {
@@ -839,7 +840,7 @@ struct __gen_red_by_seg_scan_input
             const _KeyType& __next_key = __in_keys[__id + 1];
             const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key);
             return oneapi::dpl::__internal::make_tuple(
-                oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}),
+                oneapi::dpl::__internal::make_tuple(__new_seg_mask, __current_val),
                 !__binary_pred(__current_key, __next_key), __next_key, __current_key);
         }
         else if (__id == __n - 1)
@@ -847,14 +848,14 @@ struct __gen_red_by_seg_scan_input
             const _KeyType& __prev_key = __in_keys[__id - 1];
             const std::size_t __new_seg_mask = !__binary_pred(__prev_key, __current_key);
             return oneapi::dpl::__internal::make_tuple(
-                oneapi::dpl::__internal::make_tuple(__new_seg_mask, _ValueType{__in_vals[__id]}), true, __current_key,
+                oneapi::dpl::__internal::make_tuple(__new_seg_mask, __current_val), true, __current_key,
                 __current_key); // Passing __current_key as the next key for the last element is a placeholder
         }
         else // __id == 0
         {
             const _KeyType& __next_key = __in_keys[__id + 1];
             return oneapi::dpl::__internal::make_tuple(
-                oneapi::dpl::__internal::make_tuple(std::size_t{0}, _ValueType{__in_vals[__id]}),
+                oneapi::dpl::__internal::make_tuple(std::size_t{0}, __current_val),
                 !__binary_pred(__current_key, __next_key), __next_key, __current_key);
         }
     }
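Patch 33 above caches __in_vals[__id] once as __current_val and keeps the interior-element branch first, since it covers nearly every element. The per-element classification it performs can be sketched on the host as follows; the plain std::tuple and the helper's name are illustrative (the real functor returns oneDPL's internal tuple), and n >= 2 is assumed, matching the early return for trivial sizes visible in the next patch's context lines.

#include <cstddef>
#include <functional>
#include <tuple>
#include <vector>

// Returns {new-segment flag, value, ends-its-segment flag, next key} for
// element id. Branch order mirrors the device code: interior elements first,
// then the last element, then element 0.
template <typename Key, typename Val, typename BinaryPred = std::equal_to<Key>>
std::tuple<std::size_t, Val, bool, Key>
classify_element(const std::vector<Key>& keys, const std::vector<Val>& vals,
                 std::size_t id, BinaryPred pred = {})
{
    const std::size_t n = keys.size(); // assumed >= 2
    const Key& current_key = keys[id];
    const Val& current_val = vals[id];
    if (id > 0 && id < n - 1)
    {
        std::size_t new_seg = !pred(keys[id - 1], current_key);
        return {new_seg, current_val, !pred(current_key, keys[id + 1]), keys[id + 1]};
    }
    else if (id == n - 1)
    {
        std::size_t new_seg = !pred(keys[id - 1], current_key);
        // The last element always closes its segment; current_key doubles as a
        // placeholder "next key".
        return {new_seg, current_val, true, current_key};
    }
    else // id == 0: always opens the first segment
    {
        return {std::size_t{0}, current_val, !pred(current_key, keys[id + 1]), keys[id + 1]};
    }
}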
From b2907476b2721a48cee8393ddeac933a972b0a65 Mon Sep 17 00:00:00 2001
From: Matthew Michel
Date: Tue, 10 Dec 2024 08:54:13 -0600
Subject: [PATCH 34/34] Use full namespace: oneapi::dpl::__ranges::__get_sycl_range to fix
 compilation issues

Signed-off-by: Matthew Michel
---
 include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
index d853f057561..65bf99c8777 100644
--- a/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
+++ b/include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
@@ -2030,13 +2030,15 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
         return 1;
     }

-    auto __keep_keys = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
+    auto __keep_keys = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
     auto __keys = __keep_keys(__keys_first, __keys_last);
-    auto __keep_values = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>();
+    auto __keep_values = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>();
     auto __values = __keep_values(__values_first, __values_first + __n);
-    auto __keep_key_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>();
+    auto __keep_key_outputs =
+        oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>();
     auto __out_keys = __keep_key_outputs(__out_keys_first, __out_keys_first + __n);
-    auto __keep_value_outputs = __ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>();
+    auto __keep_value_outputs =
+        oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>();
     auto __out_values = __keep_value_outputs(__out_values_first, __out_values_first + __n);
     return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
         _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __keys.all_view(), __values.all_view(),
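For context, the public entry point this series ultimately serves can be exercised as below. This is a sketch: it assumes the extension algorithms are exposed through the usual oneapi/dpl headers and that a SYCL device policy is available.

#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<int> keys{1, 1, 2, 2, 2, 3};
    std::vector<int> vals{1, 2, 3, 4, 5, 6};
    std::vector<int> out_keys(keys.size());
    std::vector<int> out_vals(vals.size());

    // Runs of equal adjacent keys are reduced with the default plus
    // operation: {1,1 | 2,2,2 | 3} -> keys {1,2,3}, sums {3,12,6}.
    auto ends = oneapi::dpl::reduce_by_segment(
        oneapi::dpl::execution::dpcpp_default, keys.begin(), keys.end(), vals.begin(),
        out_keys.begin(), out_vals.begin());

    std::size_t n_segments = ends.first - out_keys.begin();
    assert(n_segments == 3);
    assert(out_vals[0] == 3 && out_vals[1] == 12 && out_vals[2] == 6);
    return 0;
}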