Skip to content

Commit

Permalink
Remove sub-group size requirements in one-wg radix sort entirely
Browse files Browse the repository at this point in the history
Signed-off-by: Matthew Michel <matthew.michel@intel.com>
  • Loading branch information
mmichel11 committed Sep 10, 2024
1 parent c5b799e commit 7572f84
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -816,11 +816,9 @@ __parallel_radix_sort(oneapi::dpl::__internal::__device_backend_tag, _ExecutionP
else if (__n <= 4096 && __wg_size * 4 <= __max_wg_size)
__event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 4, 16, __radix_bits, __is_ascending>{}(
__exec.queue(), ::std::forward<_Range>(__in_rng), __proj);
// In __subgroup_radix_sort, we request a sub-group size via _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED
// based upon the iters per item. For the below cases, register spills that result in runtime exceptions have
// been observed on accelerators that do not support the requested sub-group size of 16. For the above cases
// that request but may not receive a sub-group size of 16, inputs are small enough to avoid register
// spills on assessed hardware.
// For the below cases, register spills that result in runtime exceptions have been observed on accelerators
// that do not support the requested sub-group size of 16. For the above cases, inputs are small enough to avoid
// register spills on assessed hardware.
else if (__n <= 8192 && __wg_size * 8 <= __max_wg_size && __dev_has_sg16)
__event = __subgroup_radix_sort<_RadixSortKernel, __wg_size * 8, 16, __radix_bits, __is_ascending>{}(
__exec.queue(), ::std::forward<_Range>(__in_rng), __proj);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,6 @@ struct __subgroup_radix_sort
auto
operator()(sycl::queue __q, _RangeIn&& __src, _Proj __proj, _SLM_tag_val, _SLM_counter)
{
constexpr std::uint16_t __req_sub_group_size = 16;
uint16_t __n = __src.size();
assert(__n <= __block_size * __wg_size);

Expand All @@ -165,7 +164,7 @@ struct __subgroup_radix_sort

__cgh.parallel_for<_Name...>(
__range,
([=](sycl::nd_item<1> __it)[[_ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(__req_sub_group_size)]] {
([=](sycl::nd_item<1> __it) {
union __storage { _ValT __v[__block_size]; __storage(){} } __values;
uint16_t __wi = __it.get_local_linear_id();
uint16_t __begin_bit = 0;
Expand Down
9 changes: 0 additions & 9 deletions include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,15 +77,6 @@
# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE) intel::reqd_sub_group_size(SIZE)
#endif

// This macro is intended to be used for specifying a subgroup size as a SYCL kernel attribute for SPIR-V targets
// only. For non-SPIR-V targets, it will be empty. This macro should only be used in device code and may lead
// to incorrect behavior if used on the host.
#if _ONEDPL_DETECT_SPIRV_COMPILATION
# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(SIZE) _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE(SIZE)
#else
# define _ONEDPL_SYCL_REQD_SUB_GROUP_SIZE_IF_SUPPORTED(SIZE)
#endif

// The unified future supporting USM memory and buffers is only supported after DPCPP 2023.1
// but not by 2023.2.
#if (_ONEDPL_LIBSYCL_VERSION >= 60100 && _ONEDPL_LIBSYCL_VERSION != 60200)
Expand Down

0 comments on commit 7572f84

Please sign in to comment.