Skip to content

Commit

Permalink
Merge Compatibility to SYCL 2020
Browse files Browse the repository at this point in the history
This PR makes our DPC++ code compatible with SYCL 2020 by setting subgroup sizes explicitly
in kernel lambdas. Additionally, it fixes a few deprecation warnings from SYCL and GTest.

Related PR: #943
  • Loading branch information
upsj authored Feb 12, 2022
2 parents a9608ea + 9db55b6 commit 5efed26
Show file tree
Hide file tree
Showing 17 changed files with 169 additions and 210 deletions.
3 changes: 1 addition & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ if(MSVC)
elseif(GINKGO_BUILD_DPCPP OR CMAKE_CXX_COMPILER MATCHES "dpcpp")
# For now always use `-ffp-model=precise` with DPC++. This can be removed when
# the floating point issues are fixed.
# -sycl-std=1.2.1 (or -sycl-std=2017) is temporary workaround after 2021.4 to propagate subgroup setting correctly
set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise;-sycl-std=1.2.1" CACHE STRING
set(GINKGO_COMPILER_FLAGS "-Wpedantic;-ffp-model=precise" CACHE STRING
"Set the required CXX compiler flags, mainly used for warnings. Current default is `-Wpedantic;-ffp-model=precise`")
else()
set(GINKGO_COMPILER_FLAGS "-Wpedantic" CACHE STRING
Expand Down
2 changes: 1 addition & 1 deletion core/test/solver/multigrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ class Multigrid : public ::testing::Test {
}
};

TYPED_TEST_CASE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);
TYPED_TEST_SUITE(Multigrid, gko::test::ValueTypes, TypenameNameGenerator);


TYPED_TEST(Multigrid, MultigridFactoryKnowsItsExecutor)
Expand Down
2 changes: 1 addition & 1 deletion dpcpp/base/executor.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ void DpcppExecutor::set_device_property()
for (auto& i : subgroup_sizes) {
this->get_exec_info().subgroup_sizes.push_back(i);
}
} catch (cl::sycl::runtime_error& err) {
} catch (cl::sycl::exception& err) {
GKO_NOT_SUPPORTED(device);
}
}
Expand Down
48 changes: 30 additions & 18 deletions dpcpp/base/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
InferredArgs... args) \
{ \
queue->submit([&](sycl::handler& cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_(args..., item_ct1); \
}); \
cgh.parallel_for( \
sycl_nd_range(grid, block), [= \
](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \
config::warp_size)]] { \
kernel_(args..., item_ct1); \
}); \
}); \
}

Expand All @@ -78,17 +80,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* @param name_ the name of the host function with config
* @param kernel_ the kernel name
*/
#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \
template <std::uint32_t encoded, typename... InferredArgs> \
inline void name_(dim3 grid, dim3 block, gko::size_type, \
sycl::queue* queue, InferredArgs... args) \
{ \
queue->submit([&](sycl::handler& cgh) { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_<encoded>(args..., item_ct1); \
}); \
}); \
#define GKO_ENABLE_DEFAULT_HOST_CONFIG(name_, kernel_) \
template <std::uint32_t encoded, typename... InferredArgs> \
inline void name_(dim3 grid, dim3 block, gko::size_type, \
sycl::queue* queue, InferredArgs... args) \
{ \
queue->submit([&](sycl::handler& cgh) { \
if constexpr (KCFG_1D::decode<1>(encoded) > 1) { \
cgh.parallel_for( \
sycl_nd_range(grid, block), [= \
](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size( \
KCFG_1D::decode<1>( \
encoded))]] { \
kernel_<encoded>(args..., item_ct1); \
}); \
} else { \
cgh.parallel_for(sycl_nd_range(grid, block), \
[=](sycl::nd_item<3> item_ct1) { \
kernel_<encoded>(args..., item_ct1); \
}); \
} \
}); \
}

/**
Expand Down Expand Up @@ -119,9 +131,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}

// __WG_BOUND__ gives the cuda-like launch bound in cuda ordering
#define __WG_BOUND_1D__(x) [[intel::reqd_work_group_size(1, 1, x)]]
#define __WG_BOUND_2D__(x, y) [[intel::reqd_work_group_size(1, y, x)]]
#define __WG_BOUND_3D__(x, y, z) [[intel::reqd_work_group_size(z, y, x)]]
#define __WG_BOUND_1D__(x) [[sycl::reqd_work_group_size(1, 1, x)]]
#define __WG_BOUND_2D__(x, y) [[sycl::reqd_work_group_size(1, y, x)]]
#define __WG_BOUND_3D__(x, y, z) [[sycl::reqd_work_group_size(z, y, x)]]
#define WG_BOUND_OVERLOAD(_1, _2, _3, NAME, ...) NAME
#define __WG_BOUND__(...) \
WG_BOUND_OVERLOAD(__VA_ARGS__, __WG_BOUND_3D__, __WG_BOUND_2D__, \
Expand Down
12 changes: 5 additions & 7 deletions dpcpp/base/kernel_launch_reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void generic_kernel_reduction_1d(sycl::handler& cgh, int64 size,

cgh.parallel_for(
range, [=
](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] {
auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
const auto tidx = thread::get_thread_id_flat<int64>(idx);
const auto local_tidx = static_cast<int64>(tidx % wg_size);
Expand Down Expand Up @@ -130,7 +130,7 @@ void generic_kernel_reduction_2d(sycl::handler& cgh, int64 rows, int64 cols,

cgh.parallel_for(
range, [=
](sycl::nd_item<3> idx) [[intel::reqd_sub_group_size(sg_size)]] {
](sycl::nd_item<3> idx) [[sycl::reqd_sub_group_size(sg_size)]] {
auto subgroup_partial = &(*subgroup_partial_acc.get_pointer())[0];
const auto tidx = thread::get_thread_id_flat<int64>(idx);
const auto local_tidx = static_cast<int64>(tidx % wg_size);
Expand Down Expand Up @@ -312,7 +312,7 @@ void generic_kernel_row_reduction_2d(syn::value_list<int, ssg_size>,
exec->get_queue()->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
range, [=
](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
const auto idx =
thread::get_subwarp_id_flat<ssg_size, int64>(id);
const auto row = idx % rows;
Expand Down Expand Up @@ -368,8 +368,7 @@ void generic_kernel_col_reduction_2d_small(
block_partial_acc(cgh);
const auto range = sycl_nd_range(dim3(row_blocks), dim3(wg_size));
cgh.parallel_for(
range, [=
](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
auto block_partial = &(*block_partial_acc.get_pointer())[0];
const auto ssg_id =
thread::get_subwarp_id_flat<ssg_size, int64>(id);
Expand Down Expand Up @@ -441,8 +440,7 @@ void generic_kernel_col_reduction_2d_blocked(
sycl::access_mode::read_write, sycl::access::target::local>
block_partial_acc(cgh);
cgh.parallel_for(
range, [=
](sycl::nd_item<3> id) [[intel::reqd_sub_group_size(sg_size)]] {
range, [=](sycl::nd_item<3> id) [[sycl::reqd_sub_group_size(sg_size)]] {
const auto sg_id = thread::get_subwarp_id_flat<sg_size, int64>(id);
const auto sg_num =
thread::get_subwarp_num_flat<sg_size, int64>(id);
Expand Down
15 changes: 7 additions & 8 deletions dpcpp/components/cooperative_groups.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@ namespace detail {
* This is a limited implementation of the DPCPP thread_block_tile.
*/
template <unsigned Size>
class thread_block_tile : public sycl::ONEAPI::sub_group {
using sub_group = sycl::ONEAPI::sub_group;
class thread_block_tile : public sycl::sub_group {
using sub_group = sycl::sub_group;
using id_type = sub_group::id_type;
using mask_type = config::lane_mask_type;

Expand Down Expand Up @@ -236,9 +236,9 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
__dpct_inline__ mask_type ballot(int predicate) const noexcept
{
// todo: change it when OneAPI update the mask related api
return sycl::ONEAPI::reduce(
return sycl::reduce_over_group(
*this, (predicate != 0) ? mask_type(1) << data_.rank : mask_type(0),
sycl::ONEAPI::plus<mask_type>());
sycl::plus<mask_type>());
}

/**
Expand All @@ -247,7 +247,7 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
*/
__dpct_inline__ bool any(int predicate) const noexcept
{
return sycl::ONEAPI::any_of(*this, (predicate != 0));
return sycl::any_of_group(*this, (predicate != 0));
}

/**
Expand All @@ -256,7 +256,7 @@ class thread_block_tile : public sycl::ONEAPI::sub_group {
*/
__dpct_inline__ bool all(int predicate) const noexcept
{
return sycl::ONEAPI::all_of(*this, (predicate != 0));
return sycl::all_of_group(*this, (predicate != 0));
}


Expand Down Expand Up @@ -353,8 +353,7 @@ template <unsigned Size, typename Group>
__dpct_inline__
std::enable_if_t<(Size > 1) && Size <= 64 && !(Size & (Size - 1)),
detail::thread_block_tile<Size>>
tiled_partition
[[intel::reqd_sub_group_size(Size)]] (const Group& group)
tiled_partition(const Group& group)
{
return detail::thread_block_tile<Size>(group);
}
Expand Down
4 changes: 3 additions & 1 deletion dpcpp/components/reduction.dp.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,9 @@ void reduce_add_array(dim3 grid, dim3 block, size_type dynamic_shared_memory,
block_sum_acc_ct1(cgh);

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=
](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
KCFG_1D::decode<1>(cfg))]] {
reduce_add_array<cfg>(size, source, result, item_ct1,
*block_sum_acc_ct1.get_pointer());
});
Expand Down
61 changes: 10 additions & 51 deletions dpcpp/matrix/csr_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,9 @@ void abstract_classical_spmv(dim3 grid, dim3 block,
{
queue->submit([&](sycl::handler& cgh) {
cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block), [=
](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(
subgroup_size)]] {
abstract_classical_spmv<subgroup_size>(num_rows, val, col_idxs,
row_ptrs, b, b_stride, c,
c_stride, item_ct1);
Expand Down Expand Up @@ -744,52 +746,6 @@ void fill_in_dense(size_type num_rows, const IndexType* __restrict__ row_ptrs,
GKO_ENABLE_DEFAULT_HOST(fill_in_dense, fill_in_dense);


template <typename IndexType>
void calculate_nnz_per_row(size_type num_rows,
const IndexType* __restrict__ row_ptrs,
size_type* __restrict__ nnz_per_row,
sycl::nd_item<3> item_ct1)
{
const auto tidx = thread::get_thread_id_flat(item_ct1);
if (tidx < num_rows) {
nnz_per_row[tidx] = row_ptrs[tidx + 1] - row_ptrs[tidx];
}
}

GKO_ENABLE_DEFAULT_HOST(calculate_nnz_per_row, calculate_nnz_per_row);


void reduce_max_nnz(size_type size, const size_type* __restrict__ nnz_per_row,
size_type* __restrict__ result, sycl::nd_item<3> item_ct1,
size_type* block_max)
{
reduce_array(
size, nnz_per_row, block_max, item_ct1,
[](const size_type& x, const size_type& y) { return max(x, y); });

if (item_ct1.get_local_id(2) == 0) {
result[item_ct1.get_group(2)] = block_max[0];
}
}

void reduce_max_nnz(dim3 grid, dim3 block, size_type dynamic_shared_memory,
sycl::queue* queue, size_type size,
const size_type* nnz_per_row, size_type* result)
{
queue->submit([&](sycl::handler& cgh) {
sycl::accessor<size_type, 1, sycl::access_mode::read_write,
sycl::access::target::local>
block_max_acc_ct1(sycl::range<1>(default_block_size), cgh);

cgh.parallel_for(sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1) {
reduce_max_nnz(size, nnz_per_row, result, item_ct1,
block_max_acc_ct1.get_pointer());
});
});
}


template <typename IndexType>
void check_unsorted(const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs, IndexType num_rows,
Expand Down Expand Up @@ -1231,9 +1187,10 @@ void spmv(std::shared_ptr<const DpcppExecutor> exec,
zero<ValueType>(), c->get_values());
} else {
oneapi::mkl::sparse::gemm(
*exec->get_queue(), oneapi::mkl::transpose::nontrans,
one<ValueType>(), mat_handle,
const_cast<ValueType*>(b->get_const_values()),
*exec->get_queue(), oneapi::mkl::layout::row_major,
oneapi::mkl::transpose::nontrans,
oneapi::mkl::transpose::nontrans, one<ValueType>(),
mat_handle, const_cast<ValueType*>(b->get_const_values()),
b->get_size()[1], b->get_stride(), zero<ValueType>(),
c->get_values(), c->get_stride());
}
Expand Down Expand Up @@ -1296,7 +1253,9 @@ void advanced_spmv(std::shared_ptr<const DpcppExecutor> exec,
c->get_values());
} else {
oneapi::mkl::sparse::gemm(
*exec->get_queue(), oneapi::mkl::transpose::nontrans,
*exec->get_queue(), oneapi::mkl::layout::row_major,
oneapi::mkl::transpose::nontrans,
oneapi::mkl::transpose::nontrans,
exec->copy_val_to_host(alpha->get_const_values()),
mat_handle, const_cast<ValueType*>(b->get_const_values()),
b->get_size()[1], b->get_stride(),
Expand Down
8 changes: 4 additions & 4 deletions dpcpp/matrix/dense_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ void transpose(const size_type nrows, const size_type ncols,
}

template <std::uint32_t sg_size, typename ValueType>
__WG_BOUND__(sg_size, sg_size)
void transpose(const size_type nrows, const size_type ncols,
const ValueType* __restrict__ in, const size_type in_stride,
ValueType* __restrict__ out, const size_type out_stride,
Expand All @@ -134,7 +133,8 @@ void transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
space_acc_ct1(cgh);

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) {
transpose<sg_size>(nrows, ncols, in, in_stride, out, out_stride,
item_ct1, *space_acc_ct1.get_pointer());
});
Expand All @@ -146,7 +146,6 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(transpose_call, transpose, subgroup_list);


template <std::uint32_t sg_size, typename ValueType>
__WG_BOUND__(sg_size, sg_size)
void conj_transpose(const size_type nrows, const size_type ncols,
const ValueType* __restrict__ in, const size_type in_stride,
ValueType* __restrict__ out, const size_type out_stride,
Expand All @@ -172,7 +171,8 @@ void conj_transpose(dim3 grid, dim3 block, size_type dynamic_shared_memory,
space_acc_ct1(cgh);

cgh.parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
sycl_nd_range(grid, block),
[=](sycl::nd_item<3> item_ct1) __WG_BOUND__(sg_size, sg_size) {
conj_transpose<sg_size>(nrows, ncols, in, in_stride, out,
out_stride, item_ct1,
*space_acc_ct1.get_pointer());
Expand Down
Loading

0 comments on commit 5efed26

Please sign in to comment.