Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace SYCL backend reduce_by_segment implementation with reduce-then-scan call #1915

Merged
merged 34 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
217d041
Initial commit of reduce_by_segment with the reduce-then-scan path
mmichel11 Sep 27, 2024
7a64f8f
Revert change to ranges and use zip_view over segments / values instead
mmichel11 Oct 14, 2024
ce1d495
Implement correct return for reduce_by_segment
mmichel11 Oct 15, 2024
f36b0a0
Add support for flag predicates
mmichel11 Oct 15, 2024
c8d667c
Revert "Add support for flag predicates"
mmichel11 Oct 15, 2024
0078cbd
Re-implement support for flag predicates in a more performant manner
mmichel11 Oct 15, 2024
5c2bb40
Add fallback and remove old SYCL implementation
mmichel11 Oct 15, 2024
f5f861e
Switch from using lambdas to functors
mmichel11 Oct 15, 2024
364a889
Add device copyable specializations for red-by-seg functors and updat…
mmichel11 Oct 15, 2024
86cf9eb
Fix typo in error message in device_copyable.pass.cpp
mmichel11 Oct 15, 2024
4f5ff63
Introduce separate input generation for scan phase and update tests
mmichel11 Oct 18, 2024
6b2b328
Improve code readability
mmichel11 Oct 18, 2024
b0d04e5
Add optional first key field to scan input and remove input range in …
mmichel11 Oct 18, 2024
af44532
Update __write_op in reduce-then-scan
mmichel11 Oct 18, 2024
9066f80
Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION macro
mmichel11 Oct 18, 2024
c5ff176
Alternate testing between usm shared and device to prevent excessive …
mmichel11 Oct 21, 2024
57dc125
Performance tuning within scan input functor
mmichel11 Oct 21, 2024
3e2e7e7
Handle n=0, n=1 first in reduce_by_segment
mmichel11 Oct 21, 2024
c5efc99
Code cleanup
mmichel11 Oct 22, 2024
4f2432f
Improve comments and mark relevant variables as const
mmichel11 Oct 22, 2024
9864ab1
Add condition to ensure value type is trivially copyable to call redu…
mmichel11 Oct 22, 2024
e620a69
clang-format
mmichel11 Oct 22, 2024
cf3a243
Introduce iterator based __pattern_reduce_by_segment
mmichel11 Nov 5, 2024
bea3d38
Revert "Remove now unneeded ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTIO…
mmichel11 Nov 5, 2024
70723ea
Fix test bug where device allocation is always used for testing
mmichel11 Nov 5, 2024
76deb6f
Separate each reduce_by_segment fallback path into their own functions
mmichel11 Nov 13, 2024
b5b266d
clang-format
mmichel11 Nov 13, 2024
137416e
Address comments in reduce-then-scan based implementation
mmichel11 Nov 21, 2024
7f9bacd
Improve explanations of reduce-by-segment approach
mmichel11 Nov 22, 2024
bfdc57e
Use binary_op[_non]_device_copyable where appropriate
mmichel11 Nov 22, 2024
eb58899
Address comments in fallback implementation
mmichel11 Nov 22, 2024
45b6975
Adjust comment to improve clarity
mmichel11 Dec 6, 2024
aa1ac31
Address review comment
mmichel11 Dec 6, 2024
b290747
Use full namespace: oneapi::dpl::__ranges::__get_sycl_range to fix co…
mmichel11 Dec 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
436 changes: 8 additions & 428 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions include/oneapi/dpl/pstl/hetero/algorithm_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,48 @@ __pattern_shift_right(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec
return __last - __res;
}

// Tag type used only to produce a unique kernel name when the execution policy is wrapped
// (see make_wrapped_policy<__copy_keys_values_wrapper> in __pattern_reduce_by_segment below).
template <typename _Name>
struct __copy_keys_values_wrapper;

// Iterator-based reduce_by_segment pattern for hetero (SYCL) backends.
//
// Collapses each run of consecutive keys satisfying __binary_pred into a single output
// key/value pair, combining the run's values with __binary_op, and returns the number of
// segments written. Trivial sizes (0 and 1) are handled on the host side; otherwise the
// iterators are wrapped as SYCL-accessible ranges and the work is forwarded to the
// range-based parallel implementation.
template <typename _BackendTag, typename _ExecutionPolicy, typename _Iterator1, typename _Iterator2,
          typename _Iterator3, typename _Iterator4, typename _BinaryPredicate, typename _BinaryOperator>
typename std::iterator_traits<_Iterator3>::difference_type
__pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&& __exec, _Iterator1 __keys_first,
                            _Iterator1 __keys_last, _Iterator2 __values_first, _Iterator3 __out_keys_first,
                            _Iterator4 __out_values_first, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op)
{
    const std::size_t __num_elements = std::distance(__keys_first, __keys_last);

    // No input: no segments are produced.
    if (__num_elements == 0)
        return 0;

    // Exactly one element: it forms a single segment, so copy the key/value pair
    // straight through with a wrapped policy (unique kernel name) and report one segment.
    if (__num_elements == 1)
    {
        __brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __single_copy{};

        oneapi::dpl::__internal::__pattern_walk2_n(
            __tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_wrapper>(__exec),
            oneapi::dpl::make_zip_iterator(__keys_first, __values_first), 1,
            oneapi::dpl::make_zip_iterator(__out_keys_first, __out_values_first), __single_copy);

        return 1;
    }

    // Wrap each iterator pair as a SYCL-accessible range: inputs are read-only, outputs
    // are read-write. The "keeper" objects returned by __get_sycl_range own any
    // intermediate storage and must stay alive until the backend call below completes.
    auto __keep_keys = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator1>();
    auto __key_rng = __keep_keys(__keys_first, __keys_last);
    auto __keep_values = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _Iterator2>();
    auto __value_rng = __keep_values(__values_first, __values_first + __num_elements);
    auto __keep_key_outputs =
        oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator3>();
    auto __out_key_rng = __keep_key_outputs(__out_keys_first, __out_keys_first + __num_elements);
    auto __keep_value_outputs =
        oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator4>();
    auto __out_value_rng = __keep_value_outputs(__out_values_first, __out_values_first + __num_elements);

    // Delegate to the range-based backend implementation; it returns the segment count.
    return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
        _BackendTag{}, std::forward<_ExecutionPolicy>(__exec), __key_rng.all_view(), __value_rng.all_view(),
        __out_key_rng.all_view(), __out_value_rng.all_view(), __binary_pred, __binary_op);
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
Expand Down
132 changes: 9 additions & 123 deletions include/oneapi/dpl/pstl/hetero/algorithm_ranges_impl_hetero.h
Original file line number Diff line number Diff line change
Expand Up @@ -889,22 +889,7 @@ __pattern_minmax_element(__hetero_tag<_BackendTag>, _ExecutionPolicy&& __exec, _
//------------------------------------------------------------------------

template <typename _Name>
struct __copy_keys_wrapper;

template <typename _Name>
struct __copy_values_wrapper;

template <typename _Name>
struct __reduce1_wrapper;

template <typename _Name>
struct __reduce2_wrapper;

template <typename _Name>
struct __assign_key1_wrapper;

template <typename _Name>
struct __assign_key2_wrapper;
struct __copy_keys_values_range_wrapper;

template <typename _BackendTag, typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _Range3,
typename _Range4, typename _BinaryPredicate, typename _BinaryOperator>
Expand Down Expand Up @@ -932,117 +917,18 @@ __pattern_reduce_by_segment(__hetero_tag<_BackendTag> __tag, _ExecutionPolicy&&
__brick_copy<__hetero_tag<_BackendTag>, _ExecutionPolicy> __copy_range{};

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_wrapper>(__exec), __copy_range,
std::forward<_Range1>(__keys), std::forward<_Range3>(__out_keys));

oneapi::dpl::__internal::__ranges::__pattern_walk_n(
__tag,
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_values_wrapper>(
std::forward<_ExecutionPolicy>(__exec)),
__copy_range, std::forward<_Range2>(__values), std::forward<_Range4>(__out_values));
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__copy_keys_values_range_wrapper>(__exec),
__copy_range,
oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)));

return 1;
}

using __diff_type = oneapi::dpl::__internal::__difference_t<_Range1>;
using __key_type = oneapi::dpl::__internal::__value_t<_Range1>;
using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;

// Round 1: reduce with extra indices added to avoid long segments
// TODO: At threshold points check if the key is equal to the key at the previous threshold point, indicating a long sequence.
// Skip a round of copy_if and reduces if there are none.
auto __idx = oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __diff_type>(__exec, __n).get_buffer();
auto __tmp_out_keys =
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __key_type>(__exec, __n).get_buffer();
auto __tmp_out_values =
oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, __val_type>(__exec, __n).get_buffer();

// Replicating first element of keys view to be able to compare (i-1)-th and (i)-th key with aligned sequences,
// dropping the last key for the i-1 sequence.
auto __k1 =
oneapi::dpl::__ranges::take_view_simple(oneapi::dpl::__ranges::replicate_start_view_simple(__keys, 1), __n);

// view1 elements are a tuple of the element index and pairs of adjacent keys
// view2 elements are a tuple of the elements where key-index pairs will be written by copy_if
auto __view1 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __n), __k1, __keys);
auto __view2 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__tmp_out_keys),
experimental::ranges::views::all_write(__idx));

// use work group size adjusted to shared local memory as the maximum segment size.
std::size_t __wgroup_size =
oneapi::dpl::__internal::__slm_adjusted_work_group_size(__exec, sizeof(__key_type) + sizeof(__val_type));

// element is copied if it is the 0th element (marks beginning of first segment), is in an index
// evenly divisible by wg size (ensures segments are not long), or has a key not equal to the
// adjacent element (marks end of real segments)
// TODO: replace wgroup size with segment size based on platform specifics.
auto __intermediate_result_end = __ranges::__pattern_copy_if(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key1_wrapper>(__exec), __view1, __view2,
[__binary_pred, __wgroup_size](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
const auto index = std::get<0>(__a);
if (index == 0)
return true;
return index % __wgroup_size == 0 // segment size
|| !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // key comparison
},
unseq_backend::__brick_assign_key_position{});

//reduce by segment
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{}, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce1_wrapper>(__exec),
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__n)>(__binary_op, __n), __intermediate_result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx),
__intermediate_result_end),
std::forward<_Range2>(__values), experimental::ranges::views::all_write(__tmp_out_values))
.wait();

// Round 2: final reduction to get result for each segment of equal adjacent keys
// create views over adjacent keys
oneapi::dpl::__ranges::all_view<__key_type, __par_backend_hetero::access_mode::read_write> __new_keys(
__tmp_out_keys);

// Replicating first element of key views to be able to compare (i-1)-th and (i)-th key,
// dropping the last key for the i-1 sequence. Only taking the appropriate number of keys to start with here.
auto __clipped_new_keys = oneapi::dpl::__ranges::take_view_simple(__new_keys, __intermediate_result_end);

auto __k3 = oneapi::dpl::__ranges::take_view_simple(
oneapi::dpl::__ranges::replicate_start_view_simple(__clipped_new_keys, 1), __intermediate_result_end);

// view3 elements are a tuple of the element index and pairs of adjacent keys
// view4 elements are a tuple of the elements where key-index pairs will be written by copy_if
auto __view3 = experimental::ranges::zip_view(experimental::ranges::views::iota(0, __intermediate_result_end), __k3,
__clipped_new_keys);
auto __view4 = experimental::ranges::zip_view(experimental::ranges::views::all_write(__out_keys),
experimental::ranges::views::all_write(__idx));

// element is copied if it is the 0th element (marks beginning of first segment), or has a key not equal to
// the adjacent element (end of a segment). Artificial segments based on wg size are not created.
auto __result_end = __ranges::__pattern_copy_if(
__tag, oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__assign_key2_wrapper>(__exec), __view3, __view4,
[__binary_pred](const auto& __a) {
// The size of key range for the (i-1) view is one less, so for the 0th index we do not check the keys
// for (i-1), but we still need to get its key value as it is the start of a segment
if (std::get<0>(__a) == 0)
return true;
return !__binary_pred(std::get<1>(__a), std::get<2>(__a)); // keys comparison
},
unseq_backend::__brick_assign_key_position{});

//reduce by segment
oneapi::dpl::__par_backend_hetero::__parallel_for(
_BackendTag{},
oneapi::dpl::__par_backend_hetero::make_wrapped_policy<__reduce2_wrapper>(
std::forward<_ExecutionPolicy>(__exec)),
unseq_backend::__brick_reduce_idx<_BinaryOperator, decltype(__intermediate_result_end)>(
__binary_op, __intermediate_result_end),
__result_end,
oneapi::dpl::__ranges::take_view_simple(experimental::ranges::views::all_read(__idx), __result_end),
experimental::ranges::views::all_read(__tmp_out_values), std::forward<_Range4>(__out_values))
.__deferrable_wait();

return __result_end;
return oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment(
_BackendTag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys),
std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values),
__binary_pred, __binary_op);
}

} // namespace __ranges
Expand Down
Loading
Loading