From 911d343e13000174c0d553f2fb0f476541cd5b60 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 3 Sep 2024 16:34:24 -0500 Subject: [PATCH 01/28] Add workaround for ignored reqd-sub-group-size attribute IGC intentionally forces a sub-group size of 16 on certain iGPUs to workaround a known issue. We have to determine this by first compiling the kernels to see if the required sub-group size is respected. Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 159 ++++++++++++------ .../parallel_backend_sycl_reduce_then_scan.h | 71 +++++--- 2 files changed, 147 insertions(+), 83 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d44d4da772e..6f46fe19640 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1090,6 +1090,20 @@ struct __write_to_id_if_else _Assign __assign; }; +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +// Templated alias to easily reference reduce-then-scan-copy kernels. +template +using __reduce_then_scan_copy_kernels = + __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, + /*_GenReduceInput=*/oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>, + /*_ReduceOp=*/std::plus<_Size>, + /*_GenScanInput=*/oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>, + /*_ScanInputTransform=*/oneapi::dpl::__par_backend_hetero::__get_zeroth_element, + _WriteOp, oneapi::dpl::unseq_backend::__no_init_value<_Size>, + /*_Inclusive=*/std::true_type, _IsUniquePattern>; +#endif + template auto @@ -1122,19 +1136,29 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT if (__use_reduce_then_scan) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; + // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec + // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a + // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to + // multi-pass scan. _GenInput __gen_transform{__unary_op}; - - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, - _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, decltype(__gen_transform), _BinaryOperation, decltype(__gen_transform), _ScanInputTransform, + _WriteOp, _InitType, _Inclusive, std::false_type> __kernels(__exec); + if (__kernels.__is_compiled_sg32()) + { + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, + _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); + } } +#endif } //else use multi pass scan implementation @@ -1209,11 +1233,12 @@ struct __invoke_single_group_copy_if } }; -template auto __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, + _Kernels& __kernels, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; @@ -1222,11 +1247,12 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}, __is_unique_pattern); } +#endif template @@ -1279,30 +1305,37 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t // can simply copy the input range to the output. assert(__n > 1); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{_Assign{}}, - /*_IsUniquePattern=*/std::true_type{}); - } - else - { - - using _ReduceOp = std::plus; - using _CreateOp = - oneapi::dpl::__internal::__create_mask_unique_copy, - decltype(__n)>; - using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, - _CopyOp{_ReduceOp{}, _Assign{}}); + // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec + // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a + // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to + // multi-pass scan. + __reduce_then_scan_copy_kernels<_ExecutionPolicy, _Range1, _Range2, _GenMask, + oneapi::dpl::__internal::__difference_t<_Range1>, _WriteOp, std::true_type> + __kernels(__exec); + if (__kernels.__is_compiled_sg32()) + { + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{_Assign{}}, + /*_IsUniquePattern=*/std::true_type{}); + } } +#endif + using _ReduceOp = std::plus; + using _CreateOp = + oneapi::dpl::__internal::__create_mask_unique_copy, + decltype(__n)>; + using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, + _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, + _CopyOp{_ReduceOp{}, _Assign{}}); } template __n = __rng.size(); +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); + // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec + // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a + // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to + // multi-pass scan. + __reduce_then_scan_copy_kernels<_ExecutionPolicy, _Range1, _Range2, _GenMask, + oneapi::dpl::__internal::__difference_t<_Range1>, _WriteOp, std::false_type> + __kernels(__exec); + if (__kernels.__is_compiled_sg32()) + { + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, + _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); + } } - else - { - using _ReduceOp = std::plus; - using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; - using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; +#endif + using _ReduceOp = std::plus; + using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; + using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); - } + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), + std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); } template (__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, __assign); } +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred}, _WriteOp{__assign}, - /*_IsUniquePattern=*/std::false_type{}); - } - else - { - using _ReduceOp = std::plus<_Size>; - using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>; - using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, - /*inclusive*/ std::true_type, 1>; - - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); + // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec + // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a + // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to + // multi-pass scan. + __reduce_then_scan_copy_kernels<_ExecutionPolicy, _InRng, _OutRng, _GenMask, + _Size, _WriteOp, std::false_type> + __kernels(__exec); + if (__kernels.__is_compiled_sg32()) + { + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + _GenMask{__pred}, _WriteOp{__assign}, + /*_IsUniquePattern=*/std::false_type{}); + } } +#endif + using _ReduceOp = std::plus<_Size>; + using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>; + using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, + /*inclusive*/ std::true_type, 1>; + + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, + _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); } template #include #include +#include #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -275,14 +279,7 @@ class __reduce_then_scan_scan_kernel; template -struct __parallel_reduce_then_scan_reduce_submitter; - -template -struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive, - __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, - __internal::__optional_kernel_name<_KernelName...>> +struct __parallel_reduce_then_scan_reduce_submitter { // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory // input buffer @@ -291,7 +288,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, sycl::kernel __reduce_kernel) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -412,14 +409,7 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu template -struct __parallel_reduce_then_scan_scan_submitter; - -template -struct __parallel_reduce_then_scan_scan_submitter< - __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>> +struct __parallel_reduce_then_scan_scan_submitter { using _InitValueType = typename _InitType::__value_type; @@ -442,7 +432,7 @@ struct __parallel_reduce_then_scan_scan_submitter< operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num) const + const std::size_t __block_num, sycl::kernel __scan_kernel) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -726,6 +716,34 @@ struct __parallel_reduce_then_scan_scan_submitter< _InitType __init; }; +// We accept a set of variadic types to disambiguate between the different scan kernels. The set +// of template parameters for __parallel_transform_reduce_then_scan here is expected to be used. +template +struct __reduce_then_scan_kernels +{ + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + __reduce_then_scan_reduce_kernel, _CustomName, ParamTypes...>; + using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + __reduce_then_scan_scan_kernel, _CustomName, ParamTypes...>; + explicit __reduce_then_scan_kernels(const _ExecutionPolicy& __exec) + : __exec(__exec) + , __kernels(__internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec)) + { + } + sycl::kernel __get_reduce_kernel() const { return __kernels[0]; } + sycl::kernel __get_scan_kernel() const { return __kernels[1]; } + bool __is_compiled_sg32() const + { + return oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __get_reduce_kernel()) == std::uint32_t{32} && + oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __get_scan_kernel()) == std::uint32_t{32}; + } +private: + // idx 0 is the reduce kernel and idx 1 is the scan kernel + std::array __kernels; + const _ExecutionPolicy& __exec; +}; + // reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup // operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. template @@ -746,21 +764,17 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, + _Kernels& __kernels, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, _Inclusive, _IsUniquePattern) { - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __reduce_then_scan_reduce_kernel<_CustomName>>; - using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider< - __reduce_then_scan_scan_kernel<_CustomName>>; using _ValueType = typename _InitType::__value_type; constexpr std::uint8_t __sub_group_size = 32; @@ -811,11 +825,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ using _ReduceSubmitter = __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, - _ReduceKernel>; + typename _Kernels::_ReduceKernel>; using _ScanSubmitter = __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _ReduceOp, _GenScanInput, _ScanInputTransform, - _WriteOp, _InitType, _ScanKernel>; + _WriteOp, _InitType, typename _Kernels::_ScanKernel>; _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, @@ -849,10 +863,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __inputs_per_sub_group, __inputs_per_item, __b, __kernels.__get_reduce_kernel()); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b); + __inputs_per_sub_group, __inputs_per_item, __b, __kernels.__get_scan_kernel()); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. if (__b + 2 == __num_blocks) @@ -872,4 +886,5 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ } // namespace dpl } // namespace oneapi +#endif #endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H From 466040746a8f201aad3e15ebcddea296891e6ae1 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 22 Nov 2024 12:10:14 -0600 Subject: [PATCH 02/28] Restore branch and fix bad rebase Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 56f06e3c15a..b331bd14b68 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -297,7 +297,7 @@ struct __parallel_reduce_then_scan_reduce_submitter oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); auto __temp_acc = __scratch_container.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_KernelName...>( + __cgh.parallel_for<_KernelName>( __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); std::size_t __group_id = __ndi.get_group(0); @@ -448,7 +448,7 @@ struct __parallel_reduce_then_scan_scan_submitter auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); - __cgh.parallel_for<_KernelName...>( + __cgh.parallel_for<_KernelName>( __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); _InitValueType* __res_ptr = From 40fb286e63ff9f9944c41062ec11e1654eda18f2 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Thu, 9 Jan 2025 14:08:00 -0600 Subject: [PATCH 03/28] Rough draft of workaround with new driver behavior Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 202 +++++++++--------- .../parallel_backend_sycl_reduce_then_scan.h | 37 ++-- .../dpcpp/parallel_backend_sycl_utils.h | 33 +++ .../alg.set.operations/set_common.h | 4 +- 4 files changed, 158 insertions(+), 118 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 6f46fe19640..cbe1baf8f89 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1090,20 +1090,6 @@ struct __write_to_id_if_else _Assign __assign; }; -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT -// Templated alias to easily reference reduce-then-scan-copy kernels. -template -using __reduce_then_scan_copy_kernels = - __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, - /*_GenReduceInput=*/oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>, - /*_ReduceOp=*/std::plus<_Size>, - /*_GenScanInput=*/oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>, - /*_ScanInputTransform=*/oneapi::dpl::__par_backend_hetero::__get_zeroth_element, - _WriteOp, oneapi::dpl::unseq_backend::__no_init_value<_Size>, - /*_Inclusive=*/std::true_type, _IsUniquePattern>; -#endif - template auto @@ -1117,7 +1103,12 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen // work-group implementation requires a fundamental type which must also be trivially copyable. if constexpr (std::is_trivially_copyable_v<_Type>) { - bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); + bool __use_reduce_then_scan = +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT + oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); +#else + false; +#endif // TODO: Consider re-implementing single group scan to support types without known identities. This could also // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small. @@ -1136,27 +1127,26 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT if (__use_reduce_then_scan) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; using _ScanInputTransform = oneapi::dpl::__internal::__no_op; using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; - // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec - // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a - // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to - // multi-pass scan. - _GenInput __gen_transform{__unary_op}; - __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, decltype(__gen_transform), _BinaryOperation, decltype(__gen_transform), _ScanInputTransform, - _WriteOp, _InitType, _Inclusive, std::false_type> __kernels(__exec); - if (__kernels.__is_compiled_sg32()) - { + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + _GenInput __gen_transform{__unary_op}; + __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, decltype(__gen_transform), + _BinaryOperation, decltype(__gen_transform), _ScanInputTransform, _WriteOp, + _InitType, _Inclusive, std::false_type> + __kernels(__exec); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); - } + }); + if (__opt_return) + return __opt_return.value(); } #endif } @@ -1233,18 +1223,22 @@ struct __invoke_single_group_copy_if } }; -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT -template auto __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, - _Kernels& __kernels, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, + _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _GenMask __generate_mask, _WriteOp __write_op, _IsUniquePattern __is_unique_pattern) { using _GenReduceInput = oneapi::dpl::__par_backend_hetero::__gen_count_mask<_GenMask>; using _ReduceOp = std::plus<_Size>; using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; + __reduce_then_scan_kernels<_ExecutionPolicy, _InRng, _OutRng, _GenReduceInput, _ReduceOp, _GenScanInput, + _ScanInputTransform, _WriteOp, oneapi::dpl::unseq_backend::__no_init_value<_Size>, + /*_Inclusive*/ std::true_type, _IsUniquePattern> + __kernels(__exec); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_InRng>(__in_rng), @@ -1305,25 +1299,19 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t // can simply copy the input range to the output. assert(__n > 1); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec - // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a - // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to - // multi-pass scan. - __reduce_then_scan_copy_kernels<_ExecutionPolicy, _Range1, _Range2, _GenMask, - oneapi::dpl::__internal::__difference_t<_Range1>, _WriteOp, std::true_type> - __kernels(__exec); - if (__kernels.__is_compiled_sg32()) - { - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, _GenMask{__pred}, _WriteOp{_Assign{}}, /*_IsUniquePattern=*/std::true_type{}); - } + }); + if (__opt_return) + return __opt_return.value(); } #endif using _ReduceOp = std::plus; @@ -1338,6 +1326,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT template auto @@ -1357,11 +1346,20 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ // Writes current segment's output reduction and the next segment's output key using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; + using _Zip1Type = + decltype(oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values))); + using _Zip2Type = decltype(oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values))); + using __reduce_by_segment_kernels = __reduce_then_scan_kernels< + _ExecutionPolicy, _Zip1Type, _Zip2Type, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, + _WriteOp, oneapi::dpl::unseq_backend::__no_init_value>, + /*_Inclusive=*/std::true_type, std::false_type>; + __reduce_by_segment_kernels __kernels(__exec); std::size_t __n = __keys.size(); // __gen_red_by_seg_scan_input requires that __n > 1 assert(__n > 1); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, @@ -1369,6 +1367,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ oneapi::dpl::unseq_backend::__no_init_value>{}, /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{}); } +#endif template auto @@ -1376,25 +1375,20 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size(); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; - using _WriteOp = - oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec - // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a - // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to - // multi-pass scan. - __reduce_then_scan_copy_kernels<_ExecutionPolicy, _Range1, _Range2, _GenMask, - oneapi::dpl::__internal::__difference_t<_Range1>, _WriteOp, std::false_type> - __kernels(__exec); - if (__kernels.__is_compiled_sg32()) - { - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); - } + _GenMask{__pred}, _WriteOp{}, + /*_IsUniquePattern=*/std::false_type{}); + }); + if (__opt_return) + return __opt_return.value(); } #endif using _ReduceOp = std::plus; @@ -1436,25 +1430,19 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, __assign); } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - // Compile the kernels to check if the sub-group size is 32. This should not be necessary per the SYCL spec - // but is needed to check for an IGC workaround for a hardware bug where kernels may be compiled with a - // sub-group size of 16 despite requiring 32. If the wrong sub-group size is used, then fallback to - // multi-pass scan. - __reduce_then_scan_copy_kernels<_ExecutionPolicy, _InRng, _OutRng, _GenMask, - _Size, _WriteOp, std::false_type> - __kernels(__exec); - if (__kernels.__is_compiled_sg32()) - { - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _GenMask{__pred}, _WriteOp{__assign}, + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + __n, _GenMask{__pred}, _WriteOp{__assign}, /*_IsUniquePattern=*/std::false_type{}); - } + }); + if (__opt_return) + return __opt_return.value(); } #endif using _ReduceOp = std::plus<_Size>; @@ -1467,6 +1455,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); } +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT template auto @@ -1489,18 +1478,25 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __ using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, std::int32_t> __mask_buf(__exec, __rng1.size()); + auto __zipped = oneapi::dpl::__ranges::make_zip_view( + std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + oneapi::dpl::__ranges::all_view( + __mask_buf.get_buffer())); + + using __set_kernels = __reduce_then_scan_kernels<_ExecutionPolicy, decltype(__zipped), _Range3, _GenReduceInput, + _GenScanInput, _ScanRangeTransform, _ScanInputTransform, _WriteOp, + oneapi::dpl::unseq_backend::__no_init_value<_Size>, std::true_type, + std::false_type, std::decay_t<_Compare>>; + __set_kernels __kernels(__exec); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view( - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())), - std::forward<_Range3>(__result), _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{}, + __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, __zipped, std::forward<_Range3>(__result), + _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{}, _GenScanInput{_GenMaskScan{_MaskPredicate{}, _MaskRangeTransform{}}, _ScanRangeTransform{}}, _ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{}); } +#endif template @@ -1561,16 +1557,17 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__result), __comp, __is_op_difference); - } - else - { - return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, - __is_op_difference); + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__result), __comp, __is_op_difference); + }); + if (__opt_return) + return __opt_return.value(); } + return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), + std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, + __is_op_difference); } //------------------------------------------------------------------------ @@ -2498,19 +2495,24 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. -#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000 +#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && \ + (_ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT) if constexpr (std::is_trivially_copyable_v<__val_type>) { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; + auto [__opt_return, _] = __handle_sync_sycl_exception([&] { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), + std::forward<_Range4>(__out_values), __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + }); + if (__opt_return) + return __opt_return.value(); } } #endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b331bd14b68..8255150a532 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -17,20 +17,20 @@ #define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H // Kernel bundles are required to use this header -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT -#include -#include -#include -#include +# include +# include +# include +# include -#include "sycl_defs.h" -#include "parallel_backend_sycl_utils.h" -#include "execution_sycl_defs.h" -#include "unseq_backend_sycl.h" -#include "utils_ranges_sycl.h" +# include "sycl_defs.h" +# include "parallel_backend_sycl_utils.h" +# include "execution_sycl_defs.h" +# include "unseq_backend_sycl.h" +# include "utils_ranges_sycl.h" -#include "../../utils.h" +# include "../../utils.h" namespace oneapi { @@ -297,6 +297,7 @@ struct __parallel_reduce_then_scan_reduce_submitter oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); auto __temp_acc = __scratch_container.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); + __cgh.use_kernel_bundle(__reduce_kernel.get_kernel_bundle()); __cgh.parallel_for<_KernelName>( __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); @@ -447,7 +448,7 @@ struct __parallel_reduce_then_scan_scan_submitter auto __temp_acc = __scratch_container.template __get_scratch_acc(__cgh); auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); - + __cgh.use_kernel_bundle(__scan_kernel.get_kernel_bundle()); __cgh.parallel_for<_KernelName>( __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); @@ -718,14 +719,16 @@ struct __parallel_reduce_then_scan_scan_submitter // We accept a set of variadic types to disambiguate between the different scan kernels. The set // of template parameters for __parallel_transform_reduce_then_scan here is expected to be used. -template +template struct __reduce_then_scan_kernels { using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - __reduce_then_scan_reduce_kernel, _CustomName, ParamTypes...>; - using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - __reduce_then_scan_scan_kernel, _CustomName, ParamTypes...>; + using _ReduceKernel = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__reduce_then_scan_reduce_kernel, + _CustomName, _ParamTypes...>; + using _ScanKernel = + oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__reduce_then_scan_scan_kernel, + _CustomName, _ParamTypes...>; explicit __reduce_then_scan_kernels(const _ExecutionPolicy& __exec) : __exec(__exec) , __kernels(__internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec)) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 348484f1477..956326183be 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -842,6 +842,39 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, } }; +// This exception handler is intended to handle a software workaround by IGC for a hardware bug that +// causes IGC to throw a sycl::errc::kernel_not_supported exception for certain integrated graphics +// devices. +struct __bypass_sycl_kernel_not_supported +{ + void + operator()(const sycl::exception& __e) const + { + // TODO: We are currently just suppressing any synchronous SYCL exception. The best solution + // would be to compare __e.code() and sycl::errc::kernel_not_supported and rethrow the encountered exception + // if the two do not compare equal. However, the icpx compiler currently returns a generic error code + // which is not compliant with the SYCL spec and this approach cannot be used until error code issue is + // resolved. + } +}; + +template +auto +__handle_sync_sycl_exception(_Callable __caller, _Handler __handler = {}) + -> std::tuple, std::error_code> +{ + try + { + return std::make_tuple(__caller(), sycl::errc::success); + } + catch (const sycl::exception& __e) + { + // Handle the error and return an empty optional with the encountered error code. + __handler(__e); + return std::make_tuple(std::optional{}, __e.code()); + } +} + } // namespace __par_backend_hetero } // namespace dpl } // namespace oneapi diff --git a/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h b/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h index 4f94364144d..1328997ef83 100644 --- a/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h +++ b/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h @@ -190,7 +190,9 @@ struct test_set_intersection auto expect = sequences.first; auto out = sequences.second; auto expect_res = ::std::set_intersection(first1, last1, first2, last2, expect.begin()); - auto res = ::std::set_intersection(exec, first1, last1, first2, last2, out.begin()); + // TODO before merging: Why did create_new_policy_idx have to be added to avoid duplicate kernel names for set intersection only? Is there a bug + // in the reduce-then-scan kernel naming logic? + auto res = ::std::set_intersection(create_new_policy_idx<3>(exec), first1, last1, first2, last2, out.begin()); EXPECT_TRUE(expect_res - expect.begin() == res - out.begin(), "wrong result for set_intersection without comparator"); EXPECT_EQ_N(expect.begin(), out.begin(), ::std::distance(out.begin(), res), "wrong set_intersection effect without comparator"); From b33e132a564cb5bbc4e42c010b2b28118c695690 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 09:48:53 -0600 Subject: [PATCH 04/28] Move kernel naming within __parallel_transform_reduce_then_scan Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 41 ++++----------- .../parallel_backend_sycl_reduce_then_scan.h | 51 ++++++------------- 2 files changed, 24 insertions(+), 68 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index cbe1baf8f89..aab2c33a1a2 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1136,12 +1136,8 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen auto [__opt_return, _] = __handle_sync_sycl_exception([&] { _GenInput __gen_transform{__unary_op}; - __reduce_then_scan_kernels<_ExecutionPolicy, _Range1, _Range2, decltype(__gen_transform), - _BinaryOperation, decltype(__gen_transform), _ScanInputTransform, _WriteOp, - _InitType, _Inclusive, std::false_type> - __kernels(__exec); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_Range1>(__in_rng), + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); }); @@ -1235,13 +1231,9 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _ using _ReduceOp = std::plus<_Size>; using _GenScanInput = oneapi::dpl::__par_backend_hetero::__gen_expand_count_mask<_GenMask>; using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; - __reduce_then_scan_kernels<_ExecutionPolicy, _InRng, _OutRng, _GenReduceInput, _ReduceOp, _GenScanInput, - _ScanInputTransform, _WriteOp, oneapi::dpl::unseq_backend::__no_init_value<_Size>, - /*_Inclusive*/ std::true_type, _IsUniquePattern> - __kernels(__exec); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, std::forward<_InRng>(__in_rng), + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), _GenReduceInput{__generate_mask}, _ReduceOp{}, _GenScanInput{__generate_mask}, _ScanInputTransform{}, __write_op, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}, __is_unique_pattern); @@ -1346,20 +1338,11 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ // Writes current segment's output reduction and the next segment's output key using _WriteOp = __write_red_by_seg<_BinaryPredicate>; using _ValueType = oneapi::dpl::__internal::__value_t<_Range2>; - using _Zip1Type = - decltype(oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values))); - using _Zip2Type = decltype(oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values))); - using __reduce_by_segment_kernels = __reduce_then_scan_kernels< - _ExecutionPolicy, _Zip1Type, _Zip2Type, _GenReduceInput, _ReduceOp, _GenScanInput, _ScanInputTransform, - _WriteOp, oneapi::dpl::unseq_backend::__no_init_value>, - /*_Inclusive=*/std::true_type, std::false_type>; - __reduce_by_segment_kernels __kernels(__exec); std::size_t __n = __keys.size(); // __gen_red_by_seg_scan_input requires that __n > 1 assert(__n > 1); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, + __backend_tag, std::forward<_ExecutionPolicy>(__exec), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, @@ -1478,20 +1461,14 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __ using _ScanInputTransform = oneapi::dpl::__par_backend_hetero::__get_zeroth_element; oneapi::dpl::__par_backend_hetero::__buffer<_ExecutionPolicy, std::int32_t> __mask_buf(__exec, __rng1.size()); - auto __zipped = oneapi::dpl::__ranges::make_zip_view( - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - oneapi::dpl::__ranges::all_view( - __mask_buf.get_buffer())); - - using __set_kernels = __reduce_then_scan_kernels<_ExecutionPolicy, decltype(__zipped), _Range3, _GenReduceInput, - _GenScanInput, _ScanRangeTransform, _ScanInputTransform, _WriteOp, - oneapi::dpl::unseq_backend::__no_init_value<_Size>, std::true_type, - std::false_type, std::decay_t<_Compare>>; - __set_kernels __kernels(__exec); return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), __kernels, __zipped, std::forward<_Range3>(__result), - _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{}, + __backend_tag, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__ranges::make_zip_view( + std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + oneapi::dpl::__ranges::all_view( + __mask_buf.get_buffer())), + std::forward<_Range3>(__result), _GenReduceInput{_GenMaskReduce{__comp}}, _ReduceOp{}, _GenScanInput{_GenMaskScan{_MaskPredicate{}, _MaskRangeTransform{}}, _ScanRangeTransform{}}, _ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{}, /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{}); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 8255150a532..6a28f82f5ea 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -717,36 +717,6 @@ struct __parallel_reduce_then_scan_scan_submitter _InitType __init; }; -// We accept a set of variadic types to disambiguate between the different scan kernels. The set -// of template parameters for __parallel_transform_reduce_then_scan here is expected to be used. -template -struct __reduce_then_scan_kernels -{ - using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; - using _ReduceKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__reduce_then_scan_reduce_kernel, - _CustomName, _ParamTypes...>; - using _ScanKernel = - oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<__reduce_then_scan_scan_kernel, - _CustomName, _ParamTypes...>; - explicit __reduce_then_scan_kernels(const _ExecutionPolicy& __exec) - : __exec(__exec) - , __kernels(__internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec)) - { - } - sycl::kernel __get_reduce_kernel() const { return __kernels[0]; } - sycl::kernel __get_scan_kernel() const { return __kernels[1]; } - bool __is_compiled_sg32() const - { - return oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __get_reduce_kernel()) == std::uint32_t{32} && - oneapi::dpl::__internal::__kernel_sub_group_size(__exec, __get_scan_kernel()) == std::uint32_t{32}; - } -private: - // idx 0 is the reduce kernel and idx 1 is the scan kernel - std::array __kernels; - const _ExecutionPolicy& __exec; -}; - // reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup // operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm. template @@ -767,18 +737,27 @@ __is_gpu_with_sg_32(const _ExecutionPolicy& __exec) // _ReduceOp - a binary function which is used in the reduction and scan operations // _WriteOp - a function which accepts output range, index, and output of `_GenScanInput` applied to the input range // and performs the final write to output operation -template auto __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, - _Kernels& __kernels, _InRng&& __in_rng, _OutRng&& __out_rng, _GenReduceInput __gen_reduce_input, _ReduceOp __reduce_op, _GenScanInput __gen_scan_input, _ScanInputTransform __scan_input_transform, _WriteOp __write_op, _InitType __init, _Inclusive, _IsUniquePattern) { using _ValueType = typename _InitType::__value_type; + using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + __reduce_then_scan_reduce_kernel, _CustomName, _InRng, _OutRng, _GenReduceInput, _ReduceOp, _InitType, + _Inclusive, _IsUniquePattern>; + using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< + __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ScanInputTransform, _WriteOp, + _InitType, _Inclusive, _IsUniquePattern>; + auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); + sycl::kernel& __reduce_kernel = __kernels[0]; + sycl::kernel& __scan_kernel = __kernels[1]; constexpr std::uint8_t __sub_group_size = 32; constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType)); @@ -828,11 +807,11 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ using _ReduceSubmitter = __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType, - typename _Kernels::_ReduceKernel>; + _ReduceKernel>; using _ScanSubmitter = __parallel_reduce_then_scan_scan_submitter<__sub_group_size, __max_inputs_per_item, __inclusive, __is_unique_pattern_v, _ReduceOp, _GenScanInput, _ScanInputTransform, - _WriteOp, _InitType, typename _Kernels::_ScanKernel>; + _WriteOp, _InitType, _ScanKernel>; _ReduceSubmitter __reduce_submitter{__max_inputs_per_block, __num_sub_groups_local, __num_sub_groups_global, @@ -866,10 +845,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range); // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory. __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __kernels.__get_reduce_kernel()); + __inputs_per_sub_group, __inputs_per_item, __b, __reduce_kernel); // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan. __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event, - __inputs_per_sub_group, __inputs_per_item, __b, __kernels.__get_scan_kernel()); + __inputs_per_sub_group, __inputs_per_item, __b, __scan_kernel); __inputs_remaining -= std::min(__inputs_remaining, __block_size); // We only need to resize these parameters prior to the last block as it is the only non-full case. if (__b + 2 == __num_blocks) From c8f8dcdf6c4b9458513f96e9b5168aa492e3cda3 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 10:14:12 -0600 Subject: [PATCH 05/28] Add fallback for kernel compilation if bundles are not present Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 19 +++++++++---------- .../parallel_backend_sycl_reduce_then_scan.h | 18 ++++++++++++++---- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index aab2c33a1a2..ecb7c31f720 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1104,7 +1104,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen if constexpr (std::is_trivially_copyable_v<_Type>) { bool __use_reduce_then_scan = -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec); #else false; @@ -1127,7 +1127,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); } } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL if (__use_reduce_then_scan) { using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>; @@ -1219,7 +1219,7 @@ struct __invoke_single_group_copy_if } }; -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL template auto @@ -1291,7 +1291,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t // can simply copy the input range to the output. assert(__n > 1); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto [__opt_return, _] = __handle_sync_sycl_exception([&] { @@ -1318,7 +1318,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t _CopyOp{_ReduceOp{}, _Assign{}}); } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL template auto @@ -1358,7 +1358,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size(); -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto [__opt_return, _] = __handle_sync_sycl_exception([&] { @@ -1413,7 +1413,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __pred, __assign); } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto [__opt_return, _] = __handle_sync_sycl_exception([&] { @@ -1438,7 +1438,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); } -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if _ONEDPL_COMPILE_KERNEL template auto @@ -2472,8 +2472,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe using __val_type = oneapi::dpl::__internal::__value_t<_Range2>; // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided. -#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && \ - (_ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT) +#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && _ONEDPL_COMPILE_KERNEL if constexpr (std::is_trivially_copyable_v<__val_type>) { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6a28f82f5ea..c3a540602d9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -16,8 +16,8 @@ #ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H #define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H -// Kernel bundles are required to use this header -#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +// Kernel compilation must be supported to properly work around hardware bug on certain iGPUs +#if _ONEDPL_COMPILE_KERNEL # include # include @@ -288,7 +288,7 @@ struct __parallel_reduce_then_scan_reduce_submitter operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, sycl::kernel __reduce_kernel) const + const std::size_t __block_num, sycl::kernel& __reduce_kernel) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -297,8 +297,13 @@ struct __parallel_reduce_then_scan_reduce_submitter oneapi::dpl::__ranges::__require_access(__cgh, __in_rng); auto __temp_acc = __scratch_container.template __get_scratch_acc( __cgh, __dpl_sycl::__no_init{}); +#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__reduce_kernel.get_kernel_bundle()); +#endif __cgh.parallel_for<_KernelName>( +#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT + __reduce_kernel, +#endif __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); std::size_t __group_id = __ndi.get_group(0); @@ -433,7 +438,7 @@ struct __parallel_reduce_then_scan_scan_submitter operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, sycl::kernel __scan_kernel) const + const std::size_t __block_num, sycl::kernel& __scan_kernel) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -448,8 +453,13 @@ struct __parallel_reduce_then_scan_scan_submitter auto __temp_acc = __scratch_container.template __get_scratch_acc(__cgh); auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{}); +#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT __cgh.use_kernel_bundle(__scan_kernel.get_kernel_bundle()); +#endif __cgh.parallel_for<_KernelName>( +#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT + __scan_kernel, +#endif __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc); _InitValueType* __res_ptr = From 28d82e40556916fc889b28ad10404a0ed824d464 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 10:14:40 -0600 Subject: [PATCH 06/28] Revert unnecessary clang-format change Signed-off-by: Matthew Michel --- .../parallel_backend_sycl_reduce_then_scan.h | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index c3a540602d9..b52f3b0c0f9 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -19,18 +19,18 @@ // Kernel compilation must be supported to properly work around hardware bug on certain iGPUs #if _ONEDPL_COMPILE_KERNEL -# include -# include -# include -# include - -# include "sycl_defs.h" -# include "parallel_backend_sycl_utils.h" -# include "execution_sycl_defs.h" -# include "unseq_backend_sycl.h" -# include "utils_ranges_sycl.h" - -# include "../../utils.h" +#include +#include +#include +#include + +#include "sycl_defs.h" +#include "parallel_backend_sycl_utils.h" +#include "execution_sycl_defs.h" +#include "unseq_backend_sycl.h" +#include "utils_ranges_sycl.h" + +#include "../../utils.h" namespace oneapi { From 4c829947d5ff31da26b9e3625841fac8c4dfe8bf Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 11:12:19 -0600 Subject: [PATCH 07/28] Adjust sync exception handler and properly catch unsupported kernel exception Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 108 ++++++++++-------- .../dpcpp/parallel_backend_sycl_utils.h | 30 +++-- 2 files changed, 79 insertions(+), 59 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ecb7c31f720..3523e1627ec 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1134,13 +1134,16 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen using _ScanInputTransform = oneapi::dpl::__internal::__no_op; using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - _GenInput __gen_transform{__unary_op}; - return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + _GenInput __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, + _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); + }, + oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1294,14 +1297,16 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t #if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{_Assign{}}, - /*_IsUniquePattern=*/std::true_type{}); - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + __n, _GenMask{__pred}, _WriteOp{_Assign{}}, + /*_IsUniquePattern=*/std::true_type{}); + }, + oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1361,15 +1366,17 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen #if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; - using _WriteOp = - oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n, - _GenMask{__pred}, _WriteOp{}, - /*_IsUniquePattern=*/std::false_type{}); - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; + return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + __n, _GenMask{__pred}, _WriteOp{}, + /*_IsUniquePattern=*/std::false_type{}); + }, + oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1416,14 +1423,16 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, #if _ONEDPL_COMPILE_KERNEL else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), - __n, _GenMask{__pred}, _WriteOp{__assign}, - /*_IsUniquePattern=*/std::false_type{}); - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; + return __parallel_reduce_then_scan_copy( + __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __n, _GenMask{__pred}, _WriteOp{__assign}, + /*_IsUniquePattern=*/std::false_type{}); + }, + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1534,11 +1543,13 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__result), __comp, __is_op_difference); - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__result), __comp, __is_op_difference); + }, + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -2477,16 +2488,19 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto [__opt_return, _] = __handle_sync_sycl_exception([&] { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), - std::forward<_Range4>(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - }); + auto __opt_return = __handle_sync_sycl_exception( + [&] { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + std::forward<_Range1>(__keys), std::forward<_Range2>(__values), + std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, + __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + }, + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 956326183be..485c70c37b3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "../../iterator_impl.h" @@ -843,35 +844,40 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, }; // This exception handler is intended to handle a software workaround by IGC for a hardware bug that -// causes IGC to throw a sycl::errc::kernel_not_supported exception for certain integrated graphics -// devices. +// causes IGC to throw an exception for certain integrated graphics devices with -O0 compilation and +// a required sub-group size of 32. struct __bypass_sycl_kernel_not_supported { void operator()(const sycl::exception& __e) const { - // TODO: We are currently just suppressing any synchronous SYCL exception. The best solution - // would be to compare __e.code() and sycl::errc::kernel_not_supported and rethrow the encountered exception - // if the two do not compare equal. However, the icpx compiler currently returns a generic error code - // which is not compliant with the SYCL spec and this approach cannot be used until error code issue is - // resolved. + // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported + // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently + // returns a generic error code in violation of the SYCL spec which has a value of 7. If we are using the Intel + // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. +#ifdef _ONEDPL_LIBSYCL_VERSION // Detects either icpx or the open-source intel/llvm compiler + if (__e.code().value() != 7) + throw __e; +#else // Generic SYCL compiler. Assume it is spec compliant. + if (__e.code() != sycl::errc::kernel_not_supported) + throw __e; +#endif } }; -template +template auto -__handle_sync_sycl_exception(_Callable __caller, _Handler __handler = {}) - -> std::tuple, std::error_code> +__handle_sync_sycl_exception(_Callable __caller, _Handler __handler) -> std::optional { try { - return std::make_tuple(__caller(), sycl::errc::success); + return __caller(); } catch (const sycl::exception& __e) { // Handle the error and return an empty optional with the encountered error code. __handler(__e); - return std::make_tuple(std::optional{}, __e.code()); + return {}; } } From 730b83eba5c8da72748fdbd7fd27c4ca5c468eb2 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 11:17:39 -0600 Subject: [PATCH 08/28] Adjust lambda capture clauses to only capture forwarded fields by reference Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3523e1627ec..aa44042f223 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1135,7 +1135,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__in_rng, &__out_rng] { _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), @@ -1298,7 +1298,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__rng, &__result] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), @@ -1367,7 +1367,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__rng, &__result] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; @@ -1424,7 +1424,7 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__in_rng, &__out_rng] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; return __parallel_reduce_then_scan_copy( @@ -1544,7 +1544,7 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__rng1, &__rng2, &__result] { return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, __is_op_difference); @@ -2489,7 +2489,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [&] { + [=, &__exec, &__keys, &__values, &__out_keys, &__out_values] { auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__keys), std::forward<_Range2>(__values), From 2dd9853de685dab5d8969d83d77e37912ace1776 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 11:21:04 -0600 Subject: [PATCH 09/28] Remove unnecessary namespace fields Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index aa44042f223..10ba2fb1278 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1143,7 +1143,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); }, - oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1306,7 +1306,7 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t __n, _GenMask{__pred}, _WriteOp{_Assign{}}, /*_IsUniquePattern=*/std::true_type{}); }, - oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } @@ -1376,7 +1376,7 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen __n, _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); }, - oneapi::dpl::__par_backend_hetero::__bypass_sycl_kernel_not_supported{}); + __bypass_sycl_kernel_not_supported{}); if (__opt_return) return __opt_return.value(); } From 664531ee271ec4c95de4e00eb097c4dd5c1cd69c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 11:29:31 -0600 Subject: [PATCH 10/28] Update comment and add missing preprocessor guard Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 ++ .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 10ba2fb1278..d68351d2376 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1541,6 +1541,7 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference __is_op_difference) { +#if _ONEDPL_COMPILER_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( @@ -1553,6 +1554,7 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ if (__opt_return) return __opt_return.value(); } +#endif return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, __is_op_difference); diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 485c70c37b3..838b829de7c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -875,7 +875,7 @@ __handle_sync_sycl_exception(_Callable __caller, _Handler __handler) -> std::opt } catch (const sycl::exception& __e) { - // Handle the error and return an empty optional with the encountered error code. + // Handle the error and return an empty std::optional __handler(__e); return {}; } From 5b9d495ee6075a1841a68486eeeddeb56200540b Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 12:02:57 -0800 Subject: [PATCH 11/28] Make __kernels static so vector alloc / free occurs once on first call Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index b52f3b0c0f9..beb0f6837a3 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -765,9 +765,9 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _Inclusive, _IsUniquePattern>; - auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); - sycl::kernel& __reduce_kernel = __kernels[0]; - sycl::kernel& __scan_kernel = __kernels[1]; + static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); + sycl::kernel __reduce_kernel = __kernels[0]; + sycl::kernel __scan_kernel = __kernels[1]; constexpr std::uint8_t __sub_group_size = 32; constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType)); From 3b8272394d55fe76ebc5d6bdc63371e9fe072544 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 15:08:13 -0600 Subject: [PATCH 12/28] clang format and small updates Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 6 +++--- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d68351d2376..d1d3b62127f 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1442,9 +1442,9 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n, - _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); + return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), + std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred}, + _CopyOp{_ReduceOp{}, __assign}); } #if _ONEDPL_COMPILE_KERNEL diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index beb0f6837a3..ddd09cdad32 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -22,7 +22,6 @@ #include #include #include -#include #include "sycl_defs.h" #include "parallel_backend_sycl_utils.h" @@ -766,8 +765,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ScanInputTransform, _WriteOp, _InitType, _Inclusive, _IsUniquePattern>; static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); - sycl::kernel __reduce_kernel = __kernels[0]; - sycl::kernel __scan_kernel = __kernels[1]; + sycl::kernel& __reduce_kernel = __kernels[0]; + sycl::kernel& __scan_kernel = __kernels[1]; constexpr std::uint8_t __sub_group_size = 32; constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType)); From d3d76d3708d0547d346949118af6bde8f51d753a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Fri, 10 Jan 2025 15:15:20 -0600 Subject: [PATCH 13/28] Update use of throw Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 838b829de7c..73a7d2e2b24 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -857,10 +857,10 @@ struct __bypass_sycl_kernel_not_supported // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. #ifdef _ONEDPL_LIBSYCL_VERSION // Detects either icpx or the open-source intel/llvm compiler if (__e.code().value() != 7) - throw __e; + throw; #else // Generic SYCL compiler. Assume it is spec compliant. if (__e.code() != sycl::errc::kernel_not_supported) - throw __e; + throw; #endif } }; From 100172f8899f55b6a99b58a97c1149bdad12af41 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 13 Jan 2025 11:15:36 -0600 Subject: [PATCH 14/28] Revert change to set tests Signed-off-by: Matthew Michel --- .../algorithm/alg.sorting/alg.set.operations/set_common.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h b/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h index 1328997ef83..4f94364144d 100644 --- a/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h +++ b/test/parallel_api/algorithm/alg.sorting/alg.set.operations/set_common.h @@ -190,9 +190,7 @@ struct test_set_intersection auto expect = sequences.first; auto out = sequences.second; auto expect_res = ::std::set_intersection(first1, last1, first2, last2, expect.begin()); - // TODO before merging: Why did create_new_policy_idx have to be added to avoid duplicate kernel names for set intersection only? Is there a bug - // in the reduce-then-scan kernel naming logic? - auto res = ::std::set_intersection(create_new_policy_idx<3>(exec), first1, last1, first2, last2, out.begin()); + auto res = ::std::set_intersection(exec, first1, last1, first2, last2, out.begin()); EXPECT_TRUE(expect_res - expect.begin() == res - out.begin(), "wrong result for set_intersection without comparator"); EXPECT_EQ_N(expect.begin(), out.begin(), ::std::distance(out.begin(), res), "wrong set_intersection effect without comparator"); From 6f04c4d726c685de3821ac33902eadee389bb05f Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 13 Jan 2025 11:16:56 -0600 Subject: [PATCH 15/28] Add missing functor to kernel name generator template list Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index ddd09cdad32..1c8a3443e00 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -762,8 +762,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __reduce_then_scan_reduce_kernel, _CustomName, _InRng, _OutRng, _GenReduceInput, _ReduceOp, _InitType, _Inclusive, _IsUniquePattern>; using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< - __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ScanInputTransform, _WriteOp, - _InitType, _Inclusive, _IsUniquePattern>; + __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ReduceOp, _ScanInputTransform, + _WriteOp, _InitType, _Inclusive, _IsUniquePattern>; static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); sycl::kernel& __reduce_kernel = __kernels[0]; sycl::kernel& __scan_kernel = __kernels[1]; From 35397f13b50bf499ff32d135db6027ee0d8a3546 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 13 Jan 2025 13:34:37 -0600 Subject: [PATCH 16/28] Fix double forwarding issue that caused segfaults Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index d1d3b62127f..ec331ec9200 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1135,12 +1135,11 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__in_rng, &__out_rng] { + [=, &__exec] { _GenInput __gen_transform{__unary_op}; return __parallel_transform_reduce_then_scan( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, - _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, + __backend_tag, __exec, std::move(__in_rng), std::move(__out_rng), __gen_transform, __binary_op, + __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{}); }, __bypass_sycl_kernel_not_supported{}); @@ -1298,11 +1297,10 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__rng, &__result] { + [=, &__exec] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__rng), std::move(__result), __n, _GenMask{__pred}, _WriteOp{_Assign{}}, /*_IsUniquePattern=*/std::true_type{}); }, @@ -1367,12 +1365,11 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__rng, &__result] { + [=, &__exec] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__rng), std::move(__result), __n, _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{}); }, @@ -1424,13 +1421,12 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__in_rng, &__out_rng] { + [=, &__exec] { using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - return __parallel_reduce_then_scan_copy( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _GenMask{__pred}, _WriteOp{__assign}, - /*_IsUniquePattern=*/std::false_type{}); + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__in_rng), + std::move(__out_rng), __n, _GenMask{__pred}, _WriteOp{__assign}, + /*_IsUniquePattern=*/std::false_type{}); }, __bypass_sycl_kernel_not_supported{}); if (__opt_return) @@ -1545,10 +1541,9 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__rng1, &__rng2, &__result] { - return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), - std::forward<_Range3>(__result), __comp, __is_op_difference); + [=, &__exec] { + return __parallel_set_reduce_then_scan(__backend_tag, __exec, std::move(__rng1), std::move(__rng2), + std::move(__result), __comp, __is_op_difference); }, __bypass_sycl_kernel_not_supported{}); if (__opt_return) @@ -2491,12 +2486,10 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec, &__keys, &__values, &__out_keys, &__out_values] { + [=, &__exec] { auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), - std::forward<_Range1>(__keys), std::forward<_Range2>(__values), - std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, - __binary_op); + oneapi::dpl::__internal::__device_backend_tag{}, __exec, std::move(__keys), std::move(__values), + std::move(__out_keys), std::move(__out_values), __binary_pred, __binary_op); __res.wait(); // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the // past-the-end iterator pair of segmented reduction. From 4d454aefacac3e5175eebb59f73ddc351e5f9b93 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 13 Jan 2025 14:56:46 -0600 Subject: [PATCH 17/28] Remove perfect forwarding to work differing return types due to ref Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 46 +++++++++---------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index ec331ec9200..fbe39ce4a9c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1093,7 +1093,7 @@ struct __write_to_id_if_else template auto -__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec, _Range1&& __in_rng, _Range2&& __out_rng, std::size_t __n, _UnaryOperation __unary_op, _InitType __init, _BinaryOperation __binary_op, _Inclusive) { @@ -1122,9 +1122,9 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen std::size_t __single_group_upper_limit = __use_reduce_then_scan ? 2048 : 16384; if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform, __single_group_upper_limit)) { - return __parallel_transform_scan_single_group( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{}); + return __parallel_transform_scan_single_group(__backend_tag, __exec, std::forward<_Range1>(__in_rng), + std::forward<_Range2>(__out_rng), __n, __unary_op, __init, + __binary_op, _Inclusive{}); } } #if _ONEDPL_COMPILE_KERNEL @@ -1160,8 +1160,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen _NoOpFunctor __get_data_op; return __parallel_transform_scan_base( - __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng), - std::forward<_Range2>(__out_rng), __init, + __backend_tag, __exec, std::forward<_Range1>(__in_rng), std::forward<_Range2>(__out_rng), __init, // local scan unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner, _Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op}, __assign_op, __assign_op, @@ -1283,7 +1282,7 @@ __parallel_scan_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag template auto -__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec, _Range1&& __rng, _Range2&& __result, _BinaryPredicate __pred) { using _Assign = oneapi::dpl::__internal::__pstl_assign; @@ -1315,9 +1314,8 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t decltype(__n)>; using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, - _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, + return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + __n, _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}}, _CopyOp{_ReduceOp{}, _Assign{}}); } @@ -1357,7 +1355,7 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ template auto -__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec, _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred) { oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size(); @@ -1382,14 +1380,14 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>; using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng), - std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); + return __parallel_scan_copy(__backend_tag, __exec, std::forward<_Range1>(__rng), std::forward<_Range2>(__result), + __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}}); } template auto -__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec, _InRng&& __in_rng, _OutRng&& __out_rng, _Size __n, _Pred __pred, _Assign __assign = _Assign{}) { using _SingleGroupInvoker = __invoke_single_group_copy_if<_Size>; @@ -1414,8 +1412,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _SizeBreakpoints = std::integer_sequence; return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch( - _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __pred, __assign); + _SingleGroupInvoker{}, __n, __exec, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + __pred, __assign); } #if _ONEDPL_COMPILE_KERNEL else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) @@ -1438,9 +1436,8 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>; - return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng), - std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred}, - _CopyOp{_ReduceOp{}, __assign}); + return __parallel_scan_copy(__backend_tag, __exec, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), + __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign}); } #if _ONEDPL_COMPILE_KERNEL @@ -1533,7 +1530,7 @@ __parallel_set_scan(oneapi::dpl::__internal::__device_backend_tag __backend_tag, template auto -__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ExecutionPolicy&& __exec, +__parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, const _ExecutionPolicy& __exec, _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference __is_op_difference) { @@ -1550,9 +1547,8 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _ return __opt_return.value(); } #endif - return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1), - std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp, - __is_op_difference); + return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), + std::forward<_Range3>(__result), __comp, __is_op_difference); } //------------------------------------------------------------------------ @@ -2463,8 +2459,8 @@ __parallel_reduce_by_segment_fallback(oneapi::dpl::__internal::__device_backend_ template oneapi::dpl::__internal::__difference_t<_Range3> -__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _ExecutionPolicy&& __exec, _Range1&& __keys, - _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, +__parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, const _ExecutionPolicy& __exec, + _Range1&& __keys, _Range2&& __values, _Range3&& __out_keys, _Range4&& __out_values, _BinaryPredicate __binary_pred, _BinaryOperator __binary_op) { // The algorithm reduces values in __values where the From bac5d6ed562ff4be66786197720736e275258453 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 13 Jan 2025 15:31:42 -0600 Subject: [PATCH 18/28] Remove missed std::forward Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index fbe39ce4a9c..a5a4e72d4e0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -2498,7 +2498,7 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons } #endif return __parallel_reduce_by_segment_fallback( - oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec), + oneapi::dpl::__internal::__device_backend_tag{}, __exec, std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values), __binary_pred, __binary_op, oneapi::dpl::unseq_backend::__has_known_identity<_BinaryOperator, __val_type>{}); From 6b9838e4005d4995a4d09229026dc2fc8ee34dcd Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Wed, 22 Jan 2025 15:10:33 -0600 Subject: [PATCH 19/28] Add a broken macro for the generic error code issue Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 73a7d2e2b24..3db65cabddd 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -855,7 +855,7 @@ struct __bypass_sycl_kernel_not_supported // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently // returns a generic error code in violation of the SYCL spec which has a value of 7. If we are using the Intel // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. -#ifdef _ONEDPL_LIBSYCL_VERSION // Detects either icpx or the open-source intel/llvm compiler +#if _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN if (__e.code().value() != 7) throw; #else // Generic SYCL compiler. Assume it is spec compliant. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index abce0902be1..23c74b5d52c 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -108,6 +108,15 @@ #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100)) +// Macro to check if the exception thrown when a kernel cannot be ran on a device does not align with +// sycl::errc::kernel_not_supported as required by the SYCL spec. Detects the Intel DPC++ and open-source intel/llvm +// compilers. +#ifdef _ONEDPL_LIBSYCL_VERSION +# define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 1 +#else +# define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 0 +#endif + // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within // SYCL kernels for determining SPIR-V compilation. Using this macro on the host may lead to incorrect behavior. #ifndef _ONEDPL_DETECT_SPIRV_COMPILATION // Check if overridden for testing From a8eb5b5e505aad081795798d1bd56e60d5b9dd3a Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Sun, 26 Jan 2025 16:45:47 -0600 Subject: [PATCH 20/28] Limit broken macro to compilers before 20250200 Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 23c74b5d52c..8fba3493fb0 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -111,7 +111,7 @@ // Macro to check if the exception thrown when a kernel cannot be ran on a device does not align with // sycl::errc::kernel_not_supported as required by the SYCL spec. Detects the Intel DPC++ and open-source intel/llvm // compilers. -#ifdef _ONEDPL_LIBSYCL_VERSION +#if defined(_ONEDPL_LIBSYCL_VERSION) && (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER < 20250200) # define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 1 #else # define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 0 From fb0bf334e5e348222dc3e927110cf8d945c6160c Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Sun, 26 Jan 2025 22:01:17 -0600 Subject: [PATCH 21/28] Fix typo Signed-off-by: Matthew Michel --- include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index a5a4e72d4e0..96f96897f84 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1534,7 +1534,7 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp, _IsOpDifference __is_op_difference) { -#if _ONEDPL_COMPILER_KERNEL +#if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { auto __opt_return = __handle_sync_sycl_exception( From dd4f1c01cd274f7c51f3f79e477bf172fc91e9ef Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Mon, 27 Jan 2025 11:18:09 -0600 Subject: [PATCH 22/28] Define _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN based on LIBSYCL version Signed-off-by: Matthew Michel --- .../dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 3db65cabddd..6dd08930e61 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -855,7 +855,7 @@ struct __bypass_sycl_kernel_not_supported // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently // returns a generic error code in violation of the SYCL spec which has a value of 7. If we are using the Intel // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. -#if _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN +#if _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN if (__e.code().value() != 7) throw; #else // Generic SYCL compiler. Assume it is spec compliant. diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h index 8fba3493fb0..09706dba195 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h @@ -110,12 +110,9 @@ // Macro to check if the exception thrown when a kernel cannot be ran on a device does not align with // sycl::errc::kernel_not_supported as required by the SYCL spec. Detects the Intel DPC++ and open-source intel/llvm -// compilers. -#if defined(_ONEDPL_LIBSYCL_VERSION) && (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER < 20250200) -# define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 1 -#else -# define _ONEDPL_ICPX_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN 0 -#endif +// compilers. No fix has been provided yet, but when the LIBSYCL major version is updated we can re-evaluate if we need +// to extend it to future versions. +#define _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(90000)) // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within // SYCL kernels for determining SPIR-V compilation. Using this macro on the host may lead to incorrect behavior. From 3fcc5ccdd3a9f6a5b8abc6d042bf1ade7d3dcd73 Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Mon, 27 Jan 2025 18:03:41 -0600 Subject: [PATCH 23/28] Add _ONEDPL_LIBSYCL_PROGRAM_PRESENT checks where appropriate. Co-authored-by: Dmitriy Sobolev --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 1c8a3443e00..1f0c72b90a6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -300,7 +300,7 @@ struct __parallel_reduce_then_scan_reduce_submitter __cgh.use_kernel_bundle(__reduce_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_KernelName>( -#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __reduce_kernel, #endif __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { @@ -456,7 +456,7 @@ struct __parallel_reduce_then_scan_scan_submitter __cgh.use_kernel_bundle(__scan_kernel.get_kernel_bundle()); #endif __cgh.parallel_for<_KernelName>( -#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT +#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT __scan_kernel, #endif __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] { From 0dda41907d4d5ce0ca9da692e85f2aa832567256 Mon Sep 17 00:00:00 2001 From: Matthew Michel <106704043+mmichel11@users.noreply.github.com> Date: Tue, 28 Jan 2025 08:51:01 -0600 Subject: [PATCH 24/28] Add comment at end of macro guard Co-authored-by: Dmitriy Sobolev --- .../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 1f0c72b90a6..626dae5fbbb 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -877,5 +877,5 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ } // namespace dpl } // namespace oneapi -#endif +#endif // _ONEDPL_COMPILE_KERNEL #endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H From 114832382f4f4f388b28a454826e784a43501798 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 28 Jan 2025 09:36:38 -0600 Subject: [PATCH 25/28] Add comment on compile time constants and kernel naming Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 626dae5fbbb..6592d34071b 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -758,6 +758,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ { using _ValueType = typename _InitType::__value_type; using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>; + // Note that __sub_group_size and __max_inputs_per_item are not included in kernel names. __sub_group_size + // is always constant (32) and __max_inputs_per_item is directly tied to the input type so these are not + // necessary to obtain a unique kernel name. However, if these compile time variables are adjusted in the + // future, then we need to be careful here to ensure unique kernel naming. using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator< __reduce_then_scan_reduce_kernel, _CustomName, _InRng, _OutRng, _GenReduceInput, _ReduceOp, _InitType, _Inclusive, _IsUniquePattern>; From cba9a5765762d49db74b0a73f9e207380ec18192 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 28 Jan 2025 10:19:17 -0600 Subject: [PATCH 26/28] Make sycl::kernel references const Signed-off-by: Matthew Michel --- .../hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h index 6592d34071b..443fd8c2b86 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h @@ -287,7 +287,7 @@ struct __parallel_reduce_then_scan_reduce_submitter operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, sycl::kernel& __reduce_kernel) const + const std::size_t __block_num, const sycl::kernel& __reduce_kernel) const { using _InitValueType = typename _InitType::__value_type; return __exec.queue().submit([&, this](sycl::handler& __cgh) { @@ -437,7 +437,7 @@ struct __parallel_reduce_then_scan_scan_submitter operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng, _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event, const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item, - const std::size_t __block_num, sycl::kernel& __scan_kernel) const + const std::size_t __block_num, const sycl::kernel& __scan_kernel) const { std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size}); std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div( @@ -769,8 +769,8 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_ __reduce_then_scan_scan_kernel, _CustomName, _InRng, _OutRng, _GenScanInput, _ReduceOp, _ScanInputTransform, _WriteOp, _InitType, _Inclusive, _IsUniquePattern>; static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec); - sycl::kernel& __reduce_kernel = __kernels[0]; - sycl::kernel& __scan_kernel = __kernels[1]; + const sycl::kernel& __reduce_kernel = __kernels[0]; + const sycl::kernel& __scan_kernel = __kernels[1]; constexpr std::uint8_t __sub_group_size = 32; constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType)); From f1d3cccf63262c578c9cf8a957e7b2c4bdc400ba Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 28 Jan 2025 19:11:52 -0600 Subject: [PATCH 27/28] Remove __handle_sync_sycl_exception and directly use try...catch Signed-off-by: Matthew Michel --- .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 144 +++++++++--------- .../dpcpp/parallel_backend_sycl_utils.h | 40 ++--- 2 files changed, 85 insertions(+), 99 deletions(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 96f96897f84..17eb905ecf8 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1134,17 +1134,18 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen using _ScanInputTransform = oneapi::dpl::__internal::__no_op; using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id; - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - _GenInput __gen_transform{__unary_op}; - return __parallel_transform_reduce_then_scan( - __backend_tag, __exec, std::move(__in_rng), std::move(__out_rng), __gen_transform, __binary_op, - __gen_transform, _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, - /*_IsUniquePattern=*/std::false_type{}); - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + _GenInput __gen_transform{__unary_op}; + return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng, + __gen_transform, __binary_op, __gen_transform, + _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{}, + /*_IsUniquePattern=*/std::false_type{}); + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } #endif } @@ -1295,17 +1296,18 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_tag __backend_t #if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__rng), std::move(__result), - __n, _GenMask{__pred}, _WriteOp{_Assign{}}, - /*_IsUniquePattern=*/std::true_type{}); - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>; + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred}, + _WriteOp{_Assign{}}, + /*_IsUniquePattern=*/std::true_type{}); + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } #endif using _ReduceOp = std::plus; @@ -1344,8 +1346,8 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_ assert(__n > 1); return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), - oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), + oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)), + oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)), _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n}, _ScanInputTransform{}, _WriteOp{__binary_pred, __n}, oneapi::dpl::unseq_backend::__no_init_value>{}, @@ -1362,18 +1364,19 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen #if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; - using _WriteOp = - oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; - return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__rng), std::move(__result), - __n, _GenMask{__pred}, _WriteOp{}, - /*_IsUniquePattern=*/std::false_type{}); - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>; + using _WriteOp = + oneapi::dpl::__par_backend_hetero::__write_to_id_if_else; + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred}, + _WriteOp{}, + /*_IsUniquePattern=*/std::false_type{}); + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } #endif using _ReduceOp = std::plus; @@ -1418,17 +1421,18 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag, #if _ONEDPL_COMPILE_KERNEL else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; - using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; - return __parallel_reduce_then_scan_copy(__backend_tag, __exec, std::move(__in_rng), - std::move(__out_rng), __n, _GenMask{__pred}, _WriteOp{__assign}, - /*_IsUniquePattern=*/std::false_type{}); - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>; + using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>; + return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred}, + _WriteOp{__assign}, + /*_IsUniquePattern=*/std::false_type{}); + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } #endif using _ReduceOp = std::plus<_Size>; @@ -1466,7 +1470,7 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __ return __parallel_transform_reduce_then_scan( __backend_tag, std::forward<_ExecutionPolicy>(__exec), - oneapi::dpl::__ranges::make_zip_view( + oneapi::dpl::__ranges::zip_view( std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), oneapi::dpl::__ranges::all_view( __mask_buf.get_buffer())), @@ -1537,14 +1541,15 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, c #if _ONEDPL_COMPILE_KERNEL if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - return __parallel_set_reduce_then_scan(__backend_tag, __exec, std::move(__rng1), std::move(__rng2), - std::move(__result), __comp, __is_op_difference); - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp, + __is_op_difference); + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } #endif return __parallel_set_scan(__backend_tag, __exec, std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2), @@ -2481,19 +2486,20 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, cons { if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec)) { - auto __opt_return = __handle_sync_sycl_exception( - [=, &__exec] { - auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( - oneapi::dpl::__internal::__device_backend_tag{}, __exec, std::move(__keys), std::move(__values), - std::move(__out_keys), std::move(__out_values), __binary_pred, __binary_op); - __res.wait(); - // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the - // past-the-end iterator pair of segmented reduction. - return std::get<0>(__res.get()) + 1; - }, - __bypass_sycl_kernel_not_supported{}); - if (__opt_return) - return __opt_return.value(); + try + { + auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan( + oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values, + __binary_pred, __binary_op); + __res.wait(); + // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the + // past-the-end iterator pair of segmented reduction. + return std::get<0>(__res.get()) + 1; + } + catch (const sycl::exception& e) + { + __bypass_sycl_kernel_not_supported(e); + } } } #endif diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 6dd08930e61..1775e834369 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -21,7 +21,6 @@ #include #include #include -#include #include "../../iterator_impl.h" @@ -846,39 +845,20 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // This exception handler is intended to handle a software workaround by IGC for a hardware bug that // causes IGC to throw an exception for certain integrated graphics devices with -O0 compilation and // a required sub-group size of 32. -struct __bypass_sycl_kernel_not_supported +void +__bypass_sycl_kernel_not_supported(const sycl::exception& __e) { - void - operator()(const sycl::exception& __e) const - { - // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported - // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently - // returns a generic error code in violation of the SYCL spec which has a value of 7. If we are using the Intel - // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. + // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported + // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently + // returns a generic error code in violation of the SYCL spec which has a value of 7. If we are using the Intel + // compiler, then compare the value of the error code. Otherwise, assume the implementation is spec compliant. #if _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN - if (__e.code().value() != 7) - throw; + if (__e.code().value() != 7) + throw; #else // Generic SYCL compiler. Assume it is spec compliant. - if (__e.code() != sycl::errc::kernel_not_supported) - throw; + if (__e.code() != sycl::errc::kernel_not_supported) + throw; #endif - } -}; - -template -auto -__handle_sync_sycl_exception(_Callable __caller, _Handler __handler) -> std::optional -{ - try - { - return __caller(); - } - catch (const sycl::exception& __e) - { - // Handle the error and return an empty std::optional - __handler(__e); - return {}; - } } } // namespace __par_backend_hetero From b49dd0ea8a396fb92566b8c03f91a751062d7786 Mon Sep 17 00:00:00 2001 From: Matthew Michel Date: Tue, 28 Jan 2025 22:00:31 -0600 Subject: [PATCH 28/28] Make __bypass_sycl_kernel_not_supported inline to allow across multiple translation units Signed-off-by: Matthew Michel --- .../oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h index 1775e834369..b00dc31abe6 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h @@ -845,7 +845,7 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X, // This exception handler is intended to handle a software workaround by IGC for a hardware bug that // causes IGC to throw an exception for certain integrated graphics devices with -O0 compilation and // a required sub-group size of 32. -void +inline void __bypass_sycl_kernel_not_supported(const sycl::exception& __e) { // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported