From 5987df9c1c2f469a265cbf884b85c836d87d1e1e Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 13 Nov 2024 13:45:21 -0600 Subject: [PATCH] Separate each reduce_by_segment fallback path into their own functions Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 239 +++++++++--------- .../parallel_backend_sycl_reduce_by_segment.h | 7 +- 2 files changed, 129 insertions(+), 117 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 431d58d75ea..61975975363 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2295,6 +2295,126 @@ struct __assign_key1_wrapper; template struct __assign_key2_wrapper; +template +oneapi::dpl::__internal::__difference_t<_Range3> +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, + _Range4&& __out_values, _BinaryPredicate __binary_pred, + _BinaryOperator __binary_op, + /*known_identity=*/std::false_type) +{ + using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>; + using __key_type = oneapi::dpl::__internal::__value_t<_Range1>; + using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; + + const auto __n = __keys.size(); + // Round 1: reduce with extra indices added to avoid long segments + // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. + // Skip a round of copy_if and reduces if there are none. + auto __idx = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); + auto __tmp_out_keys = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); + auto __tmp_out_values = + oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); + + // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, + // dropping the last key for the i-1 sequence. + auto __k1 = + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); + + // view1 elements are a tuple of the element index and pairs of adjacent keys + // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); + auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // use work group size adjusted to shared local memory as the maximum segment size. + std::size_t __wgroup_size = + oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); + + // element is copied if it is the 0th element (marks beginning of first segment), is in an index + // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the + // adjacent element (marks end of real segments) + // TODO: replace wgroup size with segment size based on platform specifics. + auto __intermediate_result_end = + oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, + __n, + [__binary_pred, __wgroup_size](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + const auto index = std::get<0>(__a); + if (index == 0) + return true; + return index % __wgroup_size == 0 // segment size + || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), + __intermediate_result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), + __intermediate_result_end), + std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) + .wait(); + + // Round 2: final reduction to get result for each segment of equal adjacent keys + // create views over adjacent keys + oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( + __tmp_out_keys); + + // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, + // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. + auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); + + auto __k3 = oneapi::dpl::__ranges::take_view_simple( + oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); + + // view3 elements are a tuple of the element index and pairs of adjacent keys + // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if + auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), + __k3, __clipped_new_keys); + auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), + oneapi::dpl::__ranges::views::all_write(__idx)); + + // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to + // the adjacent element (end of a segment). Artificial segments based on wg size are not created. + auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), + __view3, __view4, __view3.size(), + [__binary_pred](const auto& __a) { + // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys + // for (i-1), but we still need to get its key value as it is the start of a segment + if (std::get<0>(__a) == 0) + return true; + return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison + }, + unseq_backend::__brick_assign_key_position{}) + .get(); + + //reduce by segment + oneapi::dpl::__par_backend_hetero::__parallel_for( + oneapi::dpl::__internal::__device_backend_tag{}, + oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( + std::forward<_ExecutionPolicy>(__exec)), + unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( + __binary_op, __intermediate_result_end), + __result_end, + oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), + oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) + .__deferrable_wait(); + return __result_end; +} + template oneapi::dpl::__internal::__difference_t<_Range3> @@ -2333,120 +2453,11 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe } } #endif - if constexpr (oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>::value) - { - return __parallel_reduce_by_segment_known_identity( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - } - else - { - // Round 1: reduce with extra indices added to avoid long segments - // TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence. - // Skip a round of copy_if and reduces if there are none. - auto __idx = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer(); - auto __tmp_out_keys = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer(); - auto __tmp_out_values = - oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer(); - - // Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences, - // dropping the last key for the i-1 sequence. - auto __k1 = - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n); - - // view1 elements are a tuple of the element index and pairs of adjacent keys - // view2 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view1 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys); - auto __view2 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__tmp_out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // use work group size adjusted to shared local memory as the maximum segment size. - std::size_t __wgroup_size = - oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type)); - - // element is copied if it is the 0th element (marks beginning of first segment), is in an index - // evenly divisible by wg size (ensures segments are not long), or has a key not equal to the - // adjacent element (marks end of real segments) - // TODO: replace wgroup size with segment size based on platform specifics. - auto __intermediate_result_end = - oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2, - __n, - [__binary_pred, __wgroup_size](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - const auto index = std::get<0>(__a); - if (index == 0) - return true; - return index % __wgroup_size == 0 // segment size - || !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), - __intermediate_result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), - __intermediate_result_end), - std::forward<_Range2>(__values), oneapi::dpl::__ranges::views::all_write(__tmp_out_values)) - .wait(); - - // Round 2: final reduction to get result for each segment of equal adjacent keys - // create views over adjacent keys - oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys( - __tmp_out_keys); - - // Replicating first element of key views to be able to compare (i-1)-th and (i)-th key, - // dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here. - auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end); - - auto __k3 = oneapi::dpl::__ranges::take_view_simple( - oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end); - - // view3 elements are a tuple of the element index and pairs of adjacent keys - // view4 elements are a tuple of the elements where key-index pairs will be written by copy_if - auto __view3 = oneapi::dpl::__ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), - __k3, __clipped_new_keys); - auto __view4 = oneapi::dpl::__ranges::zip_view(oneapi::dpl::__ranges::views::all_write(__out_keys), - oneapi::dpl::__ranges::views::all_write(__idx)); - - // element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to - // the adjacent element (end of a segment). Artificial segments based on wg size are not created. - auto __result_end = oneapi::dpl::__par_backend_hetero::__parallel_copy_if( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), - __view3, __view4, __view3.size(), - [__binary_pred](const auto& __a) { - // The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys - // for (i-1), but we still need to get its key value as it is the start of a segment - if (std::get<0>(__a) == 0) - return true; - return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison - }, - unseq_backend::__brick_assign_key_position{}) - .get(); - - //reduce by segment - oneapi::dpl::__par_backend_hetero::__parallel_for( - oneapi::dpl::__internal::__device_backend_tag{}, - oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>( - std::forward<_ExecutionPolicy>(__exec)), - unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>( - __binary_op, __intermediate_result_end), - __result_end, - oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::views::all_read(__idx), __result_end), - oneapi::dpl::__ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values)) - .__deferrable_wait(); - return __result_end; - } + return __parallel_reduce_by_segment_fallback( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op, + oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{}); } } // namespace __par_backend_hetero diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h index 14860e3830a..f3455d45779 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_by_segment.h @@ -89,9 +89,10 @@ using _SegReducePrefixPhase = __seg_reduce_prefix_kernel<_Name...>; template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment_known_identity(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, - _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) +__parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, + _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, + _BinaryPredicate __binary_pred, _BinaryOperator __binary_op, + /*known_identity=*/std::true_type) { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;