Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (Twostage) #1980

Open
wants to merge 8 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 38 additions & 41 deletions blas/impl/KokkosBlas2_gemv_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,9 @@ struct SingleLevelTransposeGEMV {
};

// Single-level parallel version of GEMV.
template <class AViewType, class XViewType, class YViewType,
class IndexType = typename AViewType::size_type>
void singleLevelGemv(const typename AViewType::execution_space& space,
const char trans[],
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, class IndexType = typename AViewType::size_type>
void singleLevelGemv(const ExecutionSpace& space, const char trans[],
typename AViewType::const_value_type& alpha,
const AViewType& A, const XViewType& x,
typename YViewType::const_value_type& beta,
Expand All @@ -222,9 +221,8 @@ void singleLevelGemv(const typename AViewType::execution_space& space,
static_assert(std::is_integral<IndexType>::value,
"IndexType must be an integer");

using y_value_type = typename YViewType::non_const_value_type;
using execution_space = typename AViewType::execution_space;
using policy_type = Kokkos::RangePolicy<execution_space, IndexType>;
using y_value_type = typename YViewType::non_const_value_type;
using policy_type = Kokkos::RangePolicy<ExecutionSpace, IndexType>;

using AlphaCoeffType = typename AViewType::non_const_value_type;
using BetaCoeffType = typename YViewType::non_const_value_type;
Expand Down Expand Up @@ -442,8 +440,8 @@ struct TwoLevelGEMV_LayoutRightTag {};
// ---------------------------------------------------------------------------------------------
// Functor for a two-level parallel_reduce version of GEMV (non-transpose),
// designed for performance on GPU. Kernel depends on the layout of A.
template <class AViewType, class XViewType, class YViewType,
class IndexType = typename AViewType::size_type>
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, class IndexType = typename AViewType::size_type>
struct TwoLevelGEMV {
using y_value_type = typename YViewType::non_const_value_type;
using AlphaCoeffType = typename AViewType::non_const_value_type;
Expand All @@ -453,9 +451,8 @@ struct TwoLevelGEMV {
std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
float, y_value_type>::type;

using execution_space = typename AViewType::execution_space;
using policy_type = Kokkos::TeamPolicy<execution_space>;
using member_type = typename policy_type::member_type;
using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
using member_type = typename policy_type::member_type;

TwoLevelGEMV(const AlphaCoeffType& alpha, const AViewType& A,
const XViewType& x, const BetaCoeffType& beta,
Expand Down Expand Up @@ -564,7 +561,8 @@ struct TwoLevelGEMV {
// transpose GEMV. The functor uses parallel-for over the columns of the input
// matrix A and each team uses parallel-reduce over the row of its column.
// The output vector y is the reduction result.
template <class AViewType, class XViewType, class YViewType, const bool conj,
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, const bool conj,
class IndexType = typename AViewType::size_type>
struct TwoLevelTransposeGEMV {
using y_value_type = typename YViewType::non_const_value_type;
Expand All @@ -575,9 +573,8 @@ struct TwoLevelTransposeGEMV {
std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
float, y_value_type>::type;

using execution_space = typename AViewType::execution_space;
using policy_type = Kokkos::TeamPolicy<execution_space>;
using member_type = typename policy_type::member_type;
using policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
using member_type = typename policy_type::member_type;

TwoLevelTransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A,
const XViewType& x, const BetaCoeffType& beta,
Expand Down Expand Up @@ -637,10 +634,9 @@ struct TwoLevelTransposeGEMV {
};

// Two-level parallel version of GEMV.
template <class AViewType, class XViewType, class YViewType,
class IndexType = typename AViewType::size_type>
void twoLevelGemv(const typename AViewType::execution_space& space,
const char trans[],
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, class IndexType = typename AViewType::size_type>
void twoLevelGemv(const ExecutionSpace& space, const char trans[],
typename AViewType::const_value_type& alpha,
const AViewType& A, const XViewType& x,
typename YViewType::const_value_type& beta,
Expand All @@ -661,9 +657,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
"IndexType must be an integer");

using y_value_type = typename YViewType::non_const_value_type;
using execution_space = typename AViewType::execution_space;
using team_policy_type = Kokkos::TeamPolicy<execution_space>;
using range_policy_type = Kokkos::RangePolicy<execution_space, IndexType>;
using team_policy_type = Kokkos::TeamPolicy<ExecutionSpace>;
using range_policy_type = Kokkos::RangePolicy<ExecutionSpace, IndexType>;

using Kokkos::ArithTraits;
using KAT = ArithTraits<typename AViewType::non_const_value_type>;
Expand Down Expand Up @@ -704,19 +699,19 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
using layout_tag =
typename std::conditional<isLayoutLeft, TwoLevelGEMV_LayoutLeftTag,
TwoLevelGEMV_LayoutRightTag>::type;
using tagged_policy = Kokkos::TeamPolicy<execution_space, layout_tag>;
using functor_type =
TwoLevelGEMV<AViewType, XViewType, YViewType, IndexType>;
using tagged_policy = Kokkos::TeamPolicy<ExecutionSpace, layout_tag>;
using functor_type = TwoLevelGEMV<ExecutionSpace, AViewType, XViewType,
YViewType, IndexType>;
functor_type functor(alpha, A, x, beta, y);
tagged_policy team;
if (isLayoutLeft) {
if constexpr (isLayoutLeft) {
using AccumScalar = typename std::conditional<
std::is_same<y_value_type, Kokkos::Experimental::half_t>::value ||
std::is_same<y_value_type, Kokkos::Experimental::bhalf_t>::value,
float, y_value_type>::type;
size_t sharedPerTeam = 32 * sizeof(AccumScalar);
IndexType numTeams = (A.extent(0) + 31) / 32;
tagged_policy temp(1, 1);
tagged_policy temp(space, 1, 1);
temp.set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam));
int teamSize =
temp.team_size_recommended(functor, Kokkos::ParallelForTag());
Expand All @@ -727,7 +722,7 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
// FIXME SYCL: team_size_recommended() returns too big of a team size.
// Kernel hangs with 1024 threads on XEHP.
#ifdef KOKKOS_ENABLE_SYCL
if (std::is_same<execution_space, Kokkos::Experimental::SYCL>::value) {
if (std::is_same<ExecutionSpace, Kokkos::Experimental::SYCL>::value) {
if (teamSize > 256) teamSize = 256;
}
#endif
Expand All @@ -749,16 +744,18 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
} else if (tr == 'T') {
// transpose, and not conj transpose
team_policy_type team(space, A.extent(1), Kokkos::AUTO);
using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType,
YViewType, false, IndexType>;
using functor_type =
TwoLevelTransposeGEMV<ExecutionSpace, AViewType, XViewType, YViewType,
false, IndexType>;
functor_type functor(alpha, A, x, beta, y);
Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team,
functor);
} else if (tr == 'C' || tr == 'H') {
// conjugate transpose
team_policy_type team(space, A.extent(1), Kokkos::AUTO);
using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType,
YViewType, true, IndexType>;
using functor_type =
TwoLevelTransposeGEMV<ExecutionSpace, AViewType, XViewType, YViewType,
true, IndexType>;
functor_type functor(alpha, A, x, beta, y);
Kokkos::parallel_for("KokkosBlas::gemv[twoLevelTranspose]", team,
functor);
Expand All @@ -769,23 +766,23 @@ void twoLevelGemv(const typename AViewType::execution_space& space,
// generalGemvImpl: use 1 level (Range) or 2 level (Team) implementation,
// depending on whether execution space is CPU or GPU. enable_if makes sure
// unused kernels are not instantiated.
template <class AViewType, class XViewType, class YViewType, class IndexType,
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, class IndexType,
typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space<
typename AViewType::execution_space>()>::type* = nullptr>
void generalGemvImpl(const typename AViewType::execution_space& space,
const char trans[],
ExecutionSpace>()>::type* = nullptr>
void generalGemvImpl(const ExecutionSpace& space, const char trans[],
typename AViewType::const_value_type& alpha,
const AViewType& A, const XViewType& x,
typename YViewType::const_value_type& beta,
const YViewType& y) {
singleLevelGemv(space, trans, alpha, A, x, beta, y);
}

template <class AViewType, class XViewType, class YViewType, class IndexType,
template <class ExecutionSpace, class AViewType, class XViewType,
class YViewType, class IndexType,
typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space<
typename AViewType::execution_space>()>::type* = nullptr>
void generalGemvImpl(const typename AViewType::execution_space& space,
const char trans[],
ExecutionSpace>()>::type* = nullptr>
void generalGemvImpl(const ExecutionSpace& space, const char trans[],
typename AViewType::const_value_type& alpha,
const AViewType& A, const XViewType& x,
typename YViewType::const_value_type& beta,
Expand Down
6 changes: 3 additions & 3 deletions blas/impl/KokkosBlas2_gemv_spec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ struct GEMV {
// Prefer int as the index type, but use a larger type if needed.
if (numRows < static_cast<size_type>(INT_MAX) &&
numCols < static_cast<size_type>(INT_MAX)) {
generalGemvImpl<AViewType, XViewType, YViewType, int>(space, trans, alpha,
A, x, beta, y);
generalGemvImpl<ExecutionSpace, AViewType, XViewType, YViewType, int>(
space, trans, alpha, A, x, beta, y);
} else {
generalGemvImpl<AViewType, XViewType, YViewType, int64_t>(
generalGemvImpl<ExecutionSpace, AViewType, XViewType, YViewType, int64_t>(
space, trans, alpha, A, x, beta, y);
}
Kokkos::Profiling::popRegion();
Expand Down
5 changes: 2 additions & 3 deletions common/src/KokkosKernels_Utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,7 +890,7 @@ void permute_block_vector(typename idx_array_type::value_type num_elements,
// TODO BMK: clean this up by removing the num_elements argument. It is unused
// but its name gives the impression that only num_elements of the vector are
// zeroed, when really it's always the whole thing.
template <class ExecSpaceIn, typename value_array_type, typename MyExecSpace>
template <class ExecSpaceIn, typename value_array_type>
void zero_vector(ExecSpaceIn &exec_space_in,
typename value_array_type::value_type /* num_elements */,
value_array_type &vector) {
Expand All @@ -906,8 +906,7 @@ void zero_vector(typename value_array_type::value_type /* num_elements */,
using ne_tmp_t = typename value_array_type::value_type;
ne_tmp_t ne_tmp = ne_tmp_t(0);
MyExecSpace my_exec_space;
zero_vector<MyExecSpace, value_array_type, MyExecSpace>(my_exec_space, ne_tmp,
vector);
zero_vector(my_exec_space, ne_tmp, vector);
}

template <typename v1, typename v2, typename v3>
Expand Down
13 changes: 13 additions & 0 deletions docs/developer/apidocs/sparse.rst
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,16 @@ par_ilut
gmres
-----
.. doxygenfunction:: gmres(KernelHandle* handle, AMatrix& A, BType& B, XType& X, Preconditioner<AMatrix>* precond)

sptrsv
------
.. doxygenfunction:: sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries)
.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries)
.. doxygenfunction:: sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values)
.. doxygenfunction:: sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values)
.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x)
.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, XType x)
.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, XType x, XType b)
.. doxygenfunction:: sptrsv_solve(KernelHandle *handle, XType x, XType b)
.. doxygenfunction:: sptrsv_solve(ExecutionSpace &space, KernelHandle *handleL, KernelHandle *handleU, XType x, XType b)
.. doxygenfunction:: sptrsv_solve(KernelHandle *handleL, KernelHandle *handleU, XType x, XType b)
10 changes: 4 additions & 6 deletions sparse/impl/KokkosSparse_gauss_seidel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1547,9 +1547,8 @@ class PointGaussSeidel {
Permuted_Yvector);
}
if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<
MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>(
my_exec_space, num_cols * block_size, Permuted_Xvector);
KokkosKernels::Impl::zero_vector(my_exec_space, num_cols * block_size,
Permuted_Xvector);
} else {
KokkosKernels::Impl::permute_block_vector<
x_value_array_type, scalar_persistent_work_view2d_t,
Expand Down Expand Up @@ -1664,9 +1663,8 @@ class PointGaussSeidel {
Permuted_Yvector);
}
if (init_zero_x_vector) {
KokkosKernels::Impl::zero_vector<
MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>(
my_exec_space, num_cols, Permuted_Xvector);
KokkosKernels::Impl::zero_vector(my_exec_space, num_cols,
Permuted_Xvector);
} else {
KokkosKernels::Impl::permute_vector<
x_value_array_type, scalar_persistent_work_view2d_t,
Expand Down
Loading