Skip to content

Commit

Permalink
Merge: Use vendor implementations for some Dense kernels in Ginkgo
Browse files Browse the repository at this point in the history
This PR uses the vendor implementations for some single-vector dense kernels such as dot, conj_dot and norm2, because we expect the vendor implementation to be the fastest for these operations.

Related PR: #967
  • Loading branch information
pratikvn authored Feb 14, 2022
2 parents 5efed26 + 3ea6e0c commit b34ae57
Show file tree
Hide file tree
Showing 12 changed files with 393 additions and 48 deletions.
3 changes: 3 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
Expand Down
13 changes: 9 additions & 4 deletions core/matrix/dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
GKO_REGISTER_OPERATION(compute_conj_dot_dispatch,
dense::compute_conj_dot_dispatch);
GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
GKO_REGISTER_OPERATION(compute_norm2_dispatch, dense::compute_norm2_dispatch);
GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs,
Expand Down Expand Up @@ -272,7 +276,8 @@ void Dense<ValueType>::compute_dot_impl(const LinOp* b, LinOp* result) const
auto exec = this->get_executor();
auto dense_b = make_temporary_conversion<ValueType>(b);
auto dense_res = make_temporary_conversion<ValueType>(result);
exec->run(dense::make_compute_dot(this, dense_b.get(), dense_res.get()));
exec->run(
dense::make_compute_dot_dispatch(this, dense_b.get(), dense_res.get()));
}


Expand All @@ -285,8 +290,8 @@ void Dense<ValueType>::compute_conj_dot_impl(const LinOp* b,
auto exec = this->get_executor();
auto dense_b = make_temporary_conversion<ValueType>(b);
auto dense_res = make_temporary_conversion<ValueType>(result);
exec->run(
dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
exec->run(dense::make_compute_conj_dot_dispatch(this, dense_b.get(),
dense_res.get()));
}


Expand All @@ -297,7 +302,7 @@ void Dense<ValueType>::compute_norm2_impl(LinOp* result) const
auto exec = this->get_executor();
auto dense_res =
make_temporary_conversion<remove_complex<ValueType>>(result);
exec->run(dense::make_compute_norm2(this, dense_res.get()));
exec->run(dense::make_compute_norm2_dispatch(this, dense_res.get()));
}

template <typename ValueType>
Expand Down
23 changes: 23 additions & 0 deletions core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,24 @@ namespace kernels {
const matrix::Diagonal<_type>* x, \
matrix::Dense<_type>* y)

// Declares the dispatching dot-product kernel: backends may route the call
// either to a vendor (BLAS) implementation or to Ginkgo's own compute_dot
// kernel, depending on the operand shapes and value type.
#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type) \
void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type) \
void compute_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

// Declares the dispatching conjugate-dot kernel: backends may route the call
// either to a vendor (BLAS) implementation or to Ginkgo's own
// compute_conj_dot kernel, depending on the operand shapes and value type.
#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type) \
void compute_conj_dot_dispatch( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type) \
void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
Expand All @@ -121,6 +133,11 @@ namespace kernels {
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

// Declares the dispatching 2-norm kernel: backends may route the call either
// to a vendor (BLAS) implementation or to Ginkgo's own compute_norm2 kernel,
// depending on the operand shape and value type. The result type is the
// real-valued counterpart of _type.
#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type) \
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \
void compute_norm1(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
Expand Down Expand Up @@ -302,10 +319,16 @@ namespace kernels {
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
Expand Down
90 changes: 80 additions & 10 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,77 @@ constexpr int default_block_size = 512;


template <typename ValueType>
void simple_apply(std::shared_ptr<const CudaExecutor> exec,
void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* x,
const matrix::Dense<ValueType>* y,
matrix::Dense<ValueType>* result)
{
if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
if (cublas::is_supported<ValueType>::value) {
auto handle = exec->get_cublas_handle();
cublas::dot(handle, x->get_size()[0], x->get_const_values(),
x->get_stride(), y->get_const_values(), y->get_stride(),
result->get_values());
} else {
compute_dot(exec, x, y, result);
}
} else {
compute_dot(exec, x, y, result);
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);


template <typename ValueType>
void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                               const matrix::Dense<ValueType>* x,
                               const matrix::Dense<ValueType>* y,
                               matrix::Dense<ValueType>* result)
{
    // Route single-column (vector) operands of a cuBLAS-supported value type
    // to the vendor conjugate-dot kernel; everything else falls back to
    // Ginkgo's own compute_conj_dot kernel.
    const auto both_single_column =
        x->get_size()[1] == 1 && y->get_size()[1] == 1;
    if (both_single_column && cublas::is_supported<ValueType>::value) {
        cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0],
                         x->get_const_values(), x->get_stride(),
                         y->get_const_values(), y->get_stride(),
                         result->get_values());
    } else {
        compute_conj_dot(exec, x, y, result);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);


template <typename ValueType>
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                            const matrix::Dense<ValueType>* x,
                            matrix::Dense<remove_complex<ValueType>>* result)
{
    // Route a single-column (vector) operand of a cuBLAS-supported value
    // type to the vendor nrm2 kernel; everything else falls back to Ginkgo's
    // own compute_norm2 kernel.
    if (x->get_size()[1] == 1 && cublas::is_supported<ValueType>::value) {
        cublas::norm2(exec->get_cublas_handle(), x->get_size()[0],
                      x->get_const_values(), x->get_stride(),
                      result->get_values());
    } else {
        compute_norm2(exec, x, result);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
Expand Down Expand Up @@ -104,7 +174,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);


template <typename ValueType>
void apply(std::shared_ptr<const CudaExecutor> exec,
void apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
Expand All @@ -131,7 +201,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
const int64* row_ptrs,
matrix::Coo<ValueType, IndexType>* result)
Expand Down Expand Up @@ -160,7 +230,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Csr<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -188,7 +258,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Ell<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -236,7 +306,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
const int64* coo_row_ptrs,
matrix::Hybrid<ValueType, IndexType>* result)
Expand Down Expand Up @@ -268,7 +338,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Sellp<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -297,7 +367,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_sparsity_csr(std::shared_ptr<const CudaExecutor> exec,
void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::SparsityCsr<ValueType, IndexType>* result)
{
Expand All @@ -323,7 +393,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType>
void transpose(std::shared_ptr<const CudaExecutor> exec,
void transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* orig,
matrix::Dense<ValueType>* trans)
{
Expand All @@ -348,7 +418,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);


template <typename ValueType>
void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* orig,
matrix::Dense<ValueType>* trans)
{
Expand Down
16 changes: 15 additions & 1 deletion cuda/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,21 @@ TEST_F(Dense, MultipleVectorCudaComputeConjDotIsEquivalentToRef)
}


TEST_F(Dense, CudaComputeNorm2IsEquivalentToRef)
TEST_F(Dense, SingleVectorCudaComputeNorm2IsEquivalentToRef)
{
    // A one-column input exercises the single-vector (vendor) norm2 path.
    set_up_vector_data(1);
    const auto result_size = gko::dim<2>{1, x->get_size()[1]};
    auto host_norm = NormVector::create(this->ref, result_size);
    auto device_norm = NormVector::create(this->cuda, result_size);

    x->compute_norm2(host_norm.get());
    dx->compute_norm2(device_norm.get());

    GKO_ASSERT_MTX_NEAR(host_norm, device_norm, 1e-14);
}


TEST_F(Dense, MultipleVectorCudaComputeNorm2IsEquivalentToRef)
{
set_up_vector_data(20);
auto norm_size = gko::dim<2>{1, x->get_size()[1]};
Expand Down
Loading

0 comments on commit b34ae57

Please sign in to comment.