Skip to content

Commit

Permalink
Merge: Use vendor implementations for some Dense kernels in Ginkgo
Browse files Browse the repository at this point in the history
This PR uses the vendor implementations for some single-vector dense kernels such as dot, conj_dot and norm2, because we expect the vendor implementation to be the fastest for these operations.

Related PR: #967
  • Loading branch information
pratikvn authored Feb 14, 2022
2 parents 5efed26 + 3ea6e0c commit b34ae57
Show file tree
Hide file tree
Showing 12 changed files with 393 additions and 48 deletions.
3 changes: 3 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,11 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
Expand Down
13 changes: 9 additions & 4 deletions core/matrix/dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,12 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
GKO_REGISTER_OPERATION(compute_conj_dot_dispatch,
dense::compute_conj_dot_dispatch);
GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
GKO_REGISTER_OPERATION(compute_norm2_dispatch, dense::compute_norm2_dispatch);
GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs,
Expand Down Expand Up @@ -272,7 +276,8 @@ void Dense<ValueType>::compute_dot_impl(const LinOp* b, LinOp* result) const
auto exec = this->get_executor();
auto dense_b = make_temporary_conversion<ValueType>(b);
auto dense_res = make_temporary_conversion<ValueType>(result);
exec->run(dense::make_compute_dot(this, dense_b.get(), dense_res.get()));
exec->run(
dense::make_compute_dot_dispatch(this, dense_b.get(), dense_res.get()));
}


Expand All @@ -285,8 +290,8 @@ void Dense<ValueType>::compute_conj_dot_impl(const LinOp* b,
auto exec = this->get_executor();
auto dense_b = make_temporary_conversion<ValueType>(b);
auto dense_res = make_temporary_conversion<ValueType>(result);
exec->run(
dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
exec->run(dense::make_compute_conj_dot_dispatch(this, dense_b.get(),
dense_res.get()));
}


Expand All @@ -297,7 +302,7 @@ void Dense<ValueType>::compute_norm2_impl(LinOp* result) const
auto exec = this->get_executor();
auto dense_res =
make_temporary_conversion<remove_complex<ValueType>>(result);
exec->run(dense::make_compute_norm2(this, dense_res.get()));
exec->run(dense::make_compute_norm2_dispatch(this, dense_res.get()));
}

template <typename ValueType>
Expand Down
23 changes: 23 additions & 0 deletions core/matrix/dense_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,12 +104,24 @@ namespace kernels {
const matrix::Diagonal<_type>* x, \
matrix::Dense<_type>* y)

// Declares the dispatching dot-product kernel: backends may route the call
// either to a vendor (BLAS) implementation or to Ginkgo's own compute_dot
// kernel, depending on the operand shapes and value type.
#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type) \
void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type) \
void compute_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

// Declares the dispatching conjugate-dot kernel: backends may route the call
// either to a vendor (BLAS) implementation or to Ginkgo's own
// compute_conj_dot kernel, depending on the operand shapes and value type.
#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type) \
void compute_conj_dot_dispatch( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, const matrix::Dense<_type>* y, \
matrix::Dense<_type>* result)

#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type) \
void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
Expand All @@ -121,6 +133,11 @@ namespace kernels {
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

// Declares the dispatching 2-norm kernel: backends may route the call either
// to a vendor (BLAS) implementation or to Ginkgo's own compute_norm2 kernel,
// depending on the operand shape and value type. The result type is the
// real-valued counterpart of _type.
#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type) \
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
matrix::Dense<remove_complex<_type>>* result)

#define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type) \
void compute_norm1(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Dense<_type>* x, \
Expand Down Expand Up @@ -302,10 +319,16 @@ namespace kernels {
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType); \
template <typename ValueType> \
GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \
Expand Down
90 changes: 80 additions & 10 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,77 @@ constexpr int default_block_size = 512;


template <typename ValueType>
void simple_apply(std::shared_ptr<const CudaExecutor> exec,
void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* x,
const matrix::Dense<ValueType>* y,
matrix::Dense<ValueType>* result)
{
if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
if (cublas::is_supported<ValueType>::value) {
auto handle = exec->get_cublas_handle();
cublas::dot(handle, x->get_size()[0], x->get_const_values(),
x->get_stride(), y->get_const_values(), y->get_stride(),
result->get_values());
} else {
compute_dot(exec, x, y, result);
}
} else {
compute_dot(exec, x, y, result);
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);


template <typename ValueType>
void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                               const matrix::Dense<ValueType>* x,
                               const matrix::Dense<ValueType>* y,
                               matrix::Dense<ValueType>* result)
{
    // Route single-column (vector) operands of a cuBLAS-supported value type
    // to the vendor conjugate-dot kernel; everything else falls back to
    // Ginkgo's own compute_conj_dot kernel.
    const auto both_single_column =
        x->get_size()[1] == 1 && y->get_size()[1] == 1;
    if (both_single_column && cublas::is_supported<ValueType>::value) {
        cublas::conj_dot(exec->get_cublas_handle(), x->get_size()[0],
                         x->get_const_values(), x->get_stride(),
                         y->get_const_values(), y->get_stride(),
                         result->get_values());
    } else {
        compute_conj_dot(exec, x, y, result);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);


template <typename ValueType>
void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                            const matrix::Dense<ValueType>* x,
                            matrix::Dense<remove_complex<ValueType>>* result)
{
    // Route a single-column (vector) operand of a cuBLAS-supported value
    // type to the vendor nrm2 kernel; everything else falls back to Ginkgo's
    // own compute_norm2 kernel.
    if (x->get_size()[1] == 1 && cublas::is_supported<ValueType>::value) {
        cublas::norm2(exec->get_cublas_handle(), x->get_size()[0],
                      x->get_const_values(), x->get_stride(),
                      result->get_values());
    } else {
        compute_norm2(exec, x, result);
    }
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);


template <typename ValueType>
void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* a,
const matrix::Dense<ValueType>* b,
matrix::Dense<ValueType>* c)
Expand Down Expand Up @@ -104,7 +174,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);


template <typename ValueType>
void apply(std::shared_ptr<const CudaExecutor> exec,
void apply(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* alpha,
const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
Expand All @@ -131,7 +201,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);


template <typename ValueType, typename IndexType>
void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
const int64* row_ptrs,
matrix::Coo<ValueType, IndexType>* result)
Expand Down Expand Up @@ -160,7 +230,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Csr<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -188,7 +258,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Ell<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -236,7 +306,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
const int64* coo_row_ptrs,
matrix::Hybrid<ValueType, IndexType>* result)
Expand Down Expand Up @@ -268,7 +338,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::Sellp<ValueType, IndexType>* result)
{
Expand Down Expand Up @@ -297,7 +367,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType, typename IndexType>
void convert_to_sparsity_csr(std::shared_ptr<const CudaExecutor> exec,
void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* source,
matrix::SparsityCsr<ValueType, IndexType>* result)
{
Expand All @@ -323,7 +393,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(


template <typename ValueType>
void transpose(std::shared_ptr<const CudaExecutor> exec,
void transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* orig,
matrix::Dense<ValueType>* trans)
{
Expand All @@ -348,7 +418,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);


template <typename ValueType>
void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
const matrix::Dense<ValueType>* orig,
matrix::Dense<ValueType>* trans)
{
Expand Down
16 changes: 15 additions & 1 deletion cuda/test/matrix/dense_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,21 @@ TEST_F(Dense, MultipleVectorCudaComputeConjDotIsEquivalentToRef)
}


TEST_F(Dense, CudaComputeNorm2IsEquivalentToRef)
TEST_F(Dense, SingleVectorCudaComputeNorm2IsEquivalentToRef)
{
    // A one-column input exercises the single-vector (vendor) norm2 path.
    set_up_vector_data(1);
    const auto result_size = gko::dim<2>{1, x->get_size()[1]};
    auto host_norm = NormVector::create(this->ref, result_size);
    auto device_norm = NormVector::create(this->cuda, result_size);

    x->compute_norm2(host_norm.get());
    dx->compute_norm2(device_norm.get());

    GKO_ASSERT_MTX_NEAR(host_norm, device_norm, 1e-14);
}


TEST_F(Dense, MultipleVectorCudaComputeNorm2IsEquivalentToRef)
{
set_up_vector_data(20);
auto norm_size = gko::dim<2>{1, x->get_size()[1]};
Expand Down
Loading

0 comments on commit b34ae57

Please sign in to comment.