From be2478bb672f3e7a901060c2280d301d6341f61a Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 7 Feb 2022 22:48:43 +0100
Subject: [PATCH 1/7] Add strategy enum and set default to gko

---
 include/ginkgo/core/matrix/dense.hpp | 51 ++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 14 deletions(-)
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index e7fac1987f2..1e64c1682b3 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -155,6 +155,8 @@ class Dense
 
     using row_major_range = gko::range<gko::accessor::row_major<ValueType, 2>>;
 
+    enum class strategy_type { vendor, gko };
+
     /**
      * Creates a Dense matrix with the same size and stride as another Dense
      * matrix.
@@ -183,10 +185,11 @@ class Dense
      */
     static std::unique_ptr<Dense> create_with_type_of(
         const Dense* other, std::shared_ptr<const Executor> exec,
-        const dim<2>& size = dim<2>{})
+        const dim<2>& size = dim<2>{},
+        const strategy_type strategy = strategy_type::gko)
     {
         // See create_with_config_of()
-        return (*other).create_with_type_of_impl(exec, size, size[1]);
+        return (*other).create_with_type_of_impl(exec, size, size[1], strategy);
     }
 
     /**
@@ -199,10 +202,11 @@ class Dense
      */
     static std::unique_ptr<Dense> create_with_type_of(
         const Dense* other, std::shared_ptr<const Executor> exec,
-        const dim<2>& size, size_type stride)
+        const dim<2>& size, size_type stride,
+        const strategy_type strategy = strategy_type::gko)
     {
         // See create_with_config_of()
-        return (*other).create_with_type_of_impl(exec, size, stride);
+        return (*other).create_with_type_of_impl(exec, size, stride, strategy);
     }
 
     friend class Dense<next_precision<ValueType>>;
@@ -634,6 +638,19 @@ class Dense
         return values_.get_num_elems();
     }
 
+    /** Returns the strategy to be used for the operations. See @strategy_type
+     *
+     * @return the strategy
+     */
+    strategy_type get_strategy() const noexcept { return strategy_; }
+
+    /**
+     * Set the strategy
+     *
+     * @param strategy the dense strategy
+     */
+    void set_strategy(strategy_type strategy) { strategy_ = strategy; }
+
     /**
      * Returns a single element of the matrix.
      *
@@ -913,8 +930,9 @@ class Dense
      * @param exec  Executor associated to the matrix
      * @param size  size of the matrix
      */
-    Dense(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{})
-        : Dense(std::move(exec), size, size[1])
+    Dense(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
+          const strategy_type strategy = strategy_type::gko)
+        : Dense(std::move(exec), size, size[1], strategy)
     {}
 
     /**
@@ -927,10 +945,11 @@ class Dense
      *                  number of matrix elements)
      */
     Dense(std::shared_ptr<const Executor> exec, const dim<2>& size,
-          size_type stride)
+          size_type stride, const strategy_type strategy = strategy_type::gko)
         : EnableLinOp<Dense>(exec, size),
           values_(exec, size[0] * stride),
-          stride_(stride)
+          stride_(stride),
+          strategy_(strategy)
     {}
 
     /**
@@ -951,10 +970,12 @@ class Dense
      */
     template <typename ValuesArray>
     Dense(std::shared_ptr<const Executor> exec, const dim<2>& size,
-          ValuesArray&& values, size_type stride)
+          ValuesArray&& values, size_type stride,
+          const strategy_type strategy = strategy_type::gko)
         : EnableLinOp<Dense>(exec, size),
           values_{exec, std::forward<ValuesArray>(values)},
-          stride_{stride}
+          stride_{stride},
+          strategy_(strategy)
     {
         if (size[0] > 0 && size[1] > 0) {
             GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1,
@@ -971,7 +992,7 @@ class Dense
     virtual std::unique_ptr<Dense> create_with_same_config() const
     {
         return Dense::create(this->get_executor(), this->get_size(),
-                             this->get_stride());
+                             this->get_stride(), this->get_strategy());
     }
 
     /**
@@ -983,9 +1004,10 @@ class Dense
      */
     virtual std::unique_ptr<Dense> create_with_type_of_impl(
         std::shared_ptr<const Executor> exec, const dim<2>& size,
-        size_type stride) const
+        size_type stride,
+        const strategy_type strategy = strategy_type::gko) const
     {
-        return Dense::create(exec, size, stride);
+        return Dense::create(exec, size, stride, strategy);
     }
 
     template <typename IndexType>
@@ -1106,7 +1128,7 @@ class Dense
                 this->get_executor(),
                 range_result.length(0) * range_this.length(1) - columns.begin,
                 range_result->data),
-            stride);
+            stride, this->get_strategy());
     }
 
     void apply_impl(const LinOp* b, LinOp* x) const override;
@@ -1161,6 +1183,7 @@ class Dense
 private:
     Array<value_type> values_;
     size_type stride_;
+    strategy_type strategy_{strategy_type::gko};
 };
 
 

From e10863d0ff2d9e31267d851fc4347405d276475f Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Mon, 7 Feb 2022 22:49:13 +0100
Subject: [PATCH 2/7] Add core with switches and kernel impls

---
 core/device_hooks/common_kernels.inc.cpp |  1 +
 core/matrix/dense.cpp                    | 33 ++++++++++++++---
 core/matrix/dense_kernels.hpp            |  8 +++++
 cuda/matrix/dense_kernels.cu             | 45 ++++++++++++++++++------
 cuda/test/matrix/dense_kernels.cpp       | 22 ++++++++++++
 dpcpp/matrix/dense_kernels.dp.cpp        | 30 ++++++++++------
 hip/matrix/dense_kernels.hip.cpp         | 45 ++++++++++++++++++------
 hip/test/matrix/dense_kernels.hip.cpp    | 22 ++++++++++++
 omp/matrix/dense_kernels.cpp             | 30 ++++++++++------
 reference/matrix/dense_kernels.cpp       | 20 +++++++++++
 10 files changed, 211 insertions(+), 45 deletions(-)

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index c288aac4ea2..0c0606f6976 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -255,6 +255,7 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 1d61ac82c40..bbb4309af4e 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -77,6 +77,7 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
 GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
 GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
 GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
+GKO_REGISTER_OPERATION(compute_dot_vendor, dense::compute_dot_vendor);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
@@ -272,7 +273,14 @@ void Dense<ValueType>::compute_dot_impl(const LinOp* b, LinOp* result) const
     auto exec = this->get_executor();
     auto dense_b = make_temporary_conversion<ValueType>(b);
     auto dense_res = make_temporary_conversion<ValueType>(result);
-    exec->run(dense::make_compute_dot(this, dense_b.get(), dense_res.get()));
+    auto strat = this->get_strategy();
+    if (strat == strategy_type::gko) {
+        exec->run(
+            dense::make_compute_dot(this, dense_b.get(), dense_res.get()));
+    } else if (strat == strategy_type::vendor) {
+        exec->run(dense::make_compute_dot_vendor(this, dense_b.get(),
+                                                 dense_res.get()));
+    }
 }
 
 
@@ -285,8 +293,13 @@ void Dense<ValueType>::compute_conj_dot_impl(const LinOp* b,
     auto exec = this->get_executor();
     auto dense_b = make_temporary_conversion<ValueType>(b);
     auto dense_res = make_temporary_conversion<ValueType>(result);
-    exec->run(
-        dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
+    auto strat = this->get_strategy();
+    if (strat == strategy_type::gko) {
+        exec->run(
+            dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
+    } else if (strat == strategy_type::vendor) {
+        GKO_NOT_IMPLEMENTED;
+    }
 }
 
 
@@ -297,7 +310,12 @@ void Dense<ValueType>::compute_norm2_impl(LinOp* result) const
     auto exec = this->get_executor();
     auto dense_res =
         make_temporary_conversion<remove_complex<ValueType>>(result);
-    exec->run(dense::make_compute_norm2(this, dense_res.get()));
+    auto strat = this->get_strategy();
+    if (strat == strategy_type::gko) {
+        exec->run(dense::make_compute_norm2(this, dense_res.get()));
+    } else if (strat == strategy_type::vendor) {
+        GKO_NOT_IMPLEMENTED;
+    }
 }
 
 template <typename ValueType>
@@ -307,7 +325,12 @@ void Dense<ValueType>::compute_norm1_impl(LinOp* result) const
     auto exec = this->get_executor();
     auto dense_res =
         make_temporary_conversion<remove_complex<ValueType>>(result);
-    exec->run(dense::make_compute_norm1(this, dense_res.get()));
+    auto strat = this->get_strategy();
+    if (strat == strategy_type::gko) {
+        exec->run(dense::make_compute_norm1(this, dense_res.get()));
+    } else if (strat == strategy_type::vendor) {
+        GKO_NOT_IMPLEMENTED;
+    }
 }
 
 template <typename ValueType>
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 1cb428c8b0d..3c9f5d61133 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -110,6 +110,12 @@ namespace kernels {
                      const matrix::Dense<_type>* y,               \
                      matrix::Dense<_type>* result)
 
+#define GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL(_type)               \
+    void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec, \
+                            const matrix::Dense<_type>* x,               \
+                            const matrix::Dense<_type>* y,               \
+                            matrix::Dense<_type>* result)
+
 #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type)               \
     void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
                           const matrix::Dense<_type>* x,               \
@@ -302,6 +308,8 @@ namespace kernels {
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                        \
     template <typename ValueType>                                           \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL(ValueType);                 \
+    template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                   \
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                      \
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 7d2fc8defc7..1df5e9d3356 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -74,7 +74,32 @@ constexpr int default_block_size = 512;
 
 
 template <typename ValueType>
-void simple_apply(std::shared_ptr<const CudaExecutor> exec,
+void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Dense<ValueType>* x,
+                        const matrix::Dense<ValueType>* y,
+                        matrix::Dense<ValueType>* result)
+{
+    if (cublas::is_supported<ValueType>::value) {
+        auto handle = exec->get_cublas_handle();
+        if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+            cublas::pointer_mode_guard pm_guard(handle);
+            cublas::dot(handle, x->get_size()[0], x->get_const_values(),
+                        x->get_size()[1], y->get_const_values(),
+                        y->get_size()[1], result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
@@ -104,7 +129,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
-void apply(std::shared_ptr<const CudaExecutor> exec,
+void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* alpha,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
@@ -131,7 +156,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     const int64* row_ptrs,
                     matrix::Coo<ValueType, IndexType>* result)
@@ -160,7 +185,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Csr<ValueType, IndexType>* result)
 {
@@ -188,7 +213,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_ell(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Ell<ValueType, IndexType>* result)
 {
@@ -236,7 +261,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Dense<ValueType>* source,
                        const int64* coo_row_ptrs,
                        matrix::Hybrid<ValueType, IndexType>* result)
@@ -268,7 +293,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Sellp<ValueType, IndexType>* result)
 {
@@ -297,7 +322,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sparsity_csr(std::shared_ptr<const CudaExecutor> exec,
+void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
                              const matrix::Dense<ValueType>* source,
                              matrix::SparsityCsr<ValueType, IndexType>* result)
 {
@@ -323,7 +348,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType>
-void transpose(std::shared_ptr<const CudaExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
@@ -348,7 +373,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
-void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp
index a45274cdb9b..a4d9622a16f 100644
--- a/cuda/test/matrix/dense_kernels.cpp
+++ b/cuda/test/matrix/dense_kernels.cpp
@@ -193,6 +193,28 @@ TEST_F(Dense, SingleVectorCudaComputeDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, SingleVectorCudaComputeDotVendorIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    dx->set_strategy(Mtx::strategy_type::vendor);
+
+    x->compute_dot(y.get(), expected.get());
+    dx->compute_dot(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorCudaComputeDotVendorFails)
+{
+    set_up_vector_data(20);
+    dx->set_strategy(Mtx::strategy_type::vendor);
+
+    x->compute_dot(y.get(), expected.get());
+    ASSERT_THROW(dx->compute_dot(dy.get(), dresult.get()), gko::NotImplemented);
+}
+
+
 TEST_F(Dense, MultipleVectorCudaComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(20);
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 618fab1ee29..17d5e74be28 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -190,7 +190,17 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose,
 
 
 template <typename ValueType>
-void simple_apply(std::shared_ptr<const DpcppExecutor> exec,
+void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Dense<ValueType>* x,
+                        const matrix::Dense<ValueType>* y,
+                        matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
@@ -207,7 +217,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
-void apply(std::shared_ptr<const DpcppExecutor> exec,
+void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* alpha,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
@@ -226,7 +236,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_coo(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     const int64* row_ptrs,
                     matrix::Coo<ValueType, IndexType>* result)
@@ -262,7 +272,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_csr(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Csr<ValueType, IndexType>* result)
 {
@@ -296,7 +306,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_ell(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Ell<ValueType, IndexType>* result)
 {
@@ -355,7 +365,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_hybrid(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Dense<ValueType>* source,
                        const int64* coo_row_ptrs,
                        matrix::Hybrid<ValueType, IndexType>* result)
@@ -411,7 +421,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sellp(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Sellp<ValueType, IndexType>* result)
 {
@@ -454,7 +464,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sparsity_csr(std::shared_ptr<const DpcppExecutor> exec,
+void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
                              const matrix::Dense<ValueType>* source,
                              matrix::SparsityCsr<ValueType, IndexType>* result)
 {
@@ -486,7 +496,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType>
-void transpose(std::shared_ptr<const DpcppExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
@@ -508,7 +518,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
-void conj_transpose(std::shared_ptr<const DpcppExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 63e42fbea5b..77b7f36806b 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -77,7 +77,32 @@ constexpr int default_block_size = 512;
 
 
 template <typename ValueType>
-void simple_apply(std::shared_ptr<const HipExecutor> exec,
+void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Dense<ValueType>* x,
+                        const matrix::Dense<ValueType>* y,
+                        matrix::Dense<ValueType>* result)
+{
+    if (hipblas::is_supported<ValueType>::value) {
+        auto handle = exec->get_hipblas_handle();
+        if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+            hipblas::pointer_mode_guard pm_guard(handle);
+            hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
+                         x->get_size()[1], y->get_const_values(),
+                         y->get_size()[1], result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        GKO_NOT_IMPLEMENTED;
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
@@ -108,7 +133,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
-void apply(std::shared_ptr<const HipExecutor> exec,
+void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* alpha,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
@@ -135,7 +160,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
+void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     const int64* row_ptrs,
                     matrix::Coo<ValueType, IndexType>* result)
@@ -164,7 +189,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
+void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Csr<ValueType, IndexType>* result)
 {
@@ -192,7 +217,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_ell(std::shared_ptr<const HipExecutor> exec,
+void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Ell<ValueType, IndexType>* result)
 {
@@ -241,7 +266,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
+void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Dense<ValueType>* source,
                        const int64* coo_row_ptrs,
                        matrix::Hybrid<ValueType, IndexType>* result)
@@ -274,7 +299,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
+void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Sellp<ValueType, IndexType>* result)
 {
@@ -305,7 +330,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sparsity_csr(std::shared_ptr<const HipExecutor> exec,
+void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
                              const matrix::Dense<ValueType>* source,
                              matrix::SparsityCsr<ValueType, IndexType>* result)
 {
@@ -332,7 +357,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType>
-void transpose(std::shared_ptr<const HipExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
@@ -357,7 +382,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
-void conj_transpose(std::shared_ptr<const HipExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp
index 894b5fcc408..e111a10b761 100644
--- a/hip/test/matrix/dense_kernels.hip.cpp
+++ b/hip/test/matrix/dense_kernels.hip.cpp
@@ -189,6 +189,28 @@ TEST_F(Dense, SingleVectorHipComputeDotIsEquivalentToRef)
 }
 
 
+TEST_F(Dense, SingleVectorHipComputeDotVendorIsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    dx->set_strategy(Mtx::strategy_type::vendor);
+
+    x->compute_dot(y.get(), expected.get());
+    dx->compute_dot(dy.get(), dresult.get());
+
+    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorHipComputeDotVendorFails)
+{
+    set_up_vector_data(20);
+    dx->set_strategy(Mtx::strategy_type::vendor);
+
+    x->compute_dot(y.get(), expected.get());
+    ASSERT_THROW(dx->compute_dot(dy.get(), dresult.get()), gko::NotImplemented);
+}
+
+
 TEST_F(Dense, MultipleVectorHipComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(20);
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index dd47a2b3ed7..df78b50a29a 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -69,7 +69,17 @@ namespace dense {
 
 
 template <typename ValueType>
-void simple_apply(std::shared_ptr<const OmpExecutor> exec,
+void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
+                        const matrix::Dense<ValueType>* x,
+                        const matrix::Dense<ValueType>* y,
+                        matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+
+
+template <typename ValueType>
+void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
                   const matrix::Dense<ValueType>* b,
                   matrix::Dense<ValueType>* c)
@@ -95,7 +105,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_SIMPLE_APPLY_KERNEL);
 
 
 template <typename ValueType>
-void apply(std::shared_ptr<const OmpExecutor> exec,
+void apply(std::shared_ptr<const DefaultExecutor> exec,
            const matrix::Dense<ValueType>* alpha,
            const matrix::Dense<ValueType>* a, const matrix::Dense<ValueType>* b,
            const matrix::Dense<ValueType>* beta, matrix::Dense<ValueType>* c)
@@ -131,7 +141,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_APPLY_KERNEL);
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_coo(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_coo(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     const int64* row_ptrs,
                     matrix::Coo<ValueType, IndexType>* result)
@@ -164,7 +174,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_csr(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_csr(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Csr<ValueType, IndexType>* result)
 {
@@ -195,7 +205,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_ell(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_ell(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* source,
                     matrix::Ell<ValueType, IndexType>* result)
 {
@@ -276,7 +286,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_hybrid(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_hybrid(std::shared_ptr<const DefaultExecutor> exec,
                        const matrix::Dense<ValueType>* source,
                        const int64* coo_row_ptrs,
                        matrix::Hybrid<ValueType, IndexType>* result)
@@ -322,7 +332,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sellp(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_sellp(std::shared_ptr<const DefaultExecutor> exec,
                       const matrix::Dense<ValueType>* source,
                       matrix::Sellp<ValueType, IndexType>* result)
 {
@@ -364,7 +374,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType, typename IndexType>
-void convert_to_sparsity_csr(std::shared_ptr<const OmpExecutor> exec,
+void convert_to_sparsity_csr(std::shared_ptr<const DefaultExecutor> exec,
                              const matrix::Dense<ValueType>* source,
                              matrix::SparsityCsr<ValueType, IndexType>* result)
 {
@@ -394,7 +404,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
 
 
 template <typename ValueType>
-void transpose(std::shared_ptr<const OmpExecutor> exec,
+void transpose(std::shared_ptr<const DefaultExecutor> exec,
                const matrix::Dense<ValueType>* orig,
                matrix::Dense<ValueType>* trans)
 {
@@ -410,7 +420,7 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_TRANSPOSE_KERNEL);
 
 
 template <typename ValueType>
-void conj_transpose(std::shared_ptr<const OmpExecutor> exec,
+void conj_transpose(std::shared_ptr<const DefaultExecutor> exec,
                     const matrix::Dense<ValueType>* orig,
                     matrix::Dense<ValueType>* trans)
 {
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index c45f3316414..f8321c2857c 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -297,6 +297,26 @@ void compute_dot(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 
 
+template <typename ValueType>
+void compute_dot_vendor(std::shared_ptr<const ReferenceExecutor> exec,
+                        const matrix::Dense<ValueType>* x,
+                        const matrix::Dense<ValueType>* y,
+                        matrix::Dense<ValueType>* result)
+{
+    for (size_type j = 0; j < x->get_size()[1]; ++j) {
+        result->at(0, j) = zero<ValueType>();
+    }
+    for (size_type i = 0; i < x->get_size()[0]; ++i) {
+        for (size_type j = 0; j < x->get_size()[1]; ++j) {
+            result->at(0, j) += x->at(i, j) * y->at(i, j);
+        }
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+
+
 template <typename ValueType>
 void compute_conj_dot(std::shared_ptr<const ReferenceExecutor> exec,
                       const matrix::Dense<ValueType>* x,

From fc5efa791a838d561ab663d03db322af31a0d9b9 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Tue, 8 Feb 2022 15:10:12 +0100
Subject: [PATCH 3/7] Move the kernel_impl switch inside the kernel.

---
 core/device_hooks/common_kernels.inc.cpp |  2 +-
 core/matrix/dense.cpp                    | 35 ++++------------
 core/matrix/dense_kernels.hpp            | 14 +++----
 cuda/matrix/dense_kernels.cu             | 18 ++++-----
 cuda/test/matrix/dense_kernels.cpp       | 22 ----------
 dpcpp/matrix/dense_kernels.dp.cpp        | 14 ++++---
 hip/matrix/dense_kernels.hip.cpp         | 18 ++++-----
 hip/test/matrix/dense_kernels.hip.cpp    | 22 ----------
 include/ginkgo/core/matrix/dense.hpp     | 51 +++++++-----------------
 omp/matrix/dense_kernels.cpp             | 14 ++++---
 reference/matrix/dense_kernels.cpp       | 19 +++------
 11 files changed, 71 insertions(+), 158 deletions(-)

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 0c0606f6976..015c83cfdc3 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -255,7 +255,7 @@ GKO_STUB_VALUE_AND_SCALAR_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_ADD_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
-GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index bbb4309af4e..94a565ffa0c 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -77,7 +77,7 @@ GKO_REGISTER_OPERATION(sub_scaled, dense::sub_scaled);
 GKO_REGISTER_OPERATION(add_scaled_diag, dense::add_scaled_diag);
 GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
 GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
-GKO_REGISTER_OPERATION(compute_dot_vendor, dense::compute_dot_vendor);
+GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
@@ -273,14 +273,8 @@ void Dense<ValueType>::compute_dot_impl(const LinOp* b, LinOp* result) const
     auto exec = this->get_executor();
     auto dense_b = make_temporary_conversion<ValueType>(b);
     auto dense_res = make_temporary_conversion<ValueType>(result);
-    auto strat = this->get_strategy();
-    if (strat == strategy_type::gko) {
-        exec->run(
-            dense::make_compute_dot(this, dense_b.get(), dense_res.get()));
-    } else if (strat == strategy_type::vendor) {
-        exec->run(dense::make_compute_dot_vendor(this, dense_b.get(),
-                                                 dense_res.get()));
-    }
+    exec->run(
+        dense::make_compute_dot_dispatch(this, dense_b.get(), dense_res.get()));
 }
 
 
@@ -293,13 +287,8 @@ void Dense<ValueType>::compute_conj_dot_impl(const LinOp* b,
     auto exec = this->get_executor();
     auto dense_b = make_temporary_conversion<ValueType>(b);
     auto dense_res = make_temporary_conversion<ValueType>(result);
-    auto strat = this->get_strategy();
-    if (strat == strategy_type::gko) {
-        exec->run(
-            dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
-    } else if (strat == strategy_type::vendor) {
-        GKO_NOT_IMPLEMENTED;
-    }
+    exec->run(
+        dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
 }
 
 
@@ -310,12 +299,7 @@ void Dense<ValueType>::compute_norm2_impl(LinOp* result) const
     auto exec = this->get_executor();
     auto dense_res =
         make_temporary_conversion<remove_complex<ValueType>>(result);
-    auto strat = this->get_strategy();
-    if (strat == strategy_type::gko) {
-        exec->run(dense::make_compute_norm2(this, dense_res.get()));
-    } else if (strat == strategy_type::vendor) {
-        GKO_NOT_IMPLEMENTED;
-    }
+    exec->run(dense::make_compute_norm2(this, dense_res.get()));
 }
 
 template <typename ValueType>
@@ -325,12 +309,7 @@ void Dense<ValueType>::compute_norm1_impl(LinOp* result) const
     auto exec = this->get_executor();
     auto dense_res =
         make_temporary_conversion<remove_complex<ValueType>>(result);
-    auto strat = this->get_strategy();
-    if (strat == strategy_type::gko) {
-        exec->run(dense::make_compute_norm1(this, dense_res.get()));
-    } else if (strat == strategy_type::vendor) {
-        GKO_NOT_IMPLEMENTED;
-    }
+    exec->run(dense::make_compute_norm1(this, dense_res.get()));
 }
 
 template <typename ValueType>
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 3c9f5d61133..45adb45ae71 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -104,18 +104,18 @@ namespace kernels {
                          const matrix::Diagonal<_type>* x,            \
                          matrix::Dense<_type>* y)
 
+#define GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(_type)               \
+    void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                              const matrix::Dense<_type>* x,               \
+                              const matrix::Dense<_type>* y,               \
+                              matrix::Dense<_type>* result)
+
 #define GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(_type)               \
     void compute_dot(std::shared_ptr<const DefaultExecutor> exec, \
                      const matrix::Dense<_type>* x,               \
                      const matrix::Dense<_type>* y,               \
                      matrix::Dense<_type>* result)
 
-#define GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL(_type)               \
-    void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec, \
-                            const matrix::Dense<_type>* x,               \
-                            const matrix::Dense<_type>* y,               \
-                            matrix::Dense<_type>* result)
-
 #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type)               \
     void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
                           const matrix::Dense<_type>* x,               \
@@ -308,7 +308,7 @@ namespace kernels {
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL(ValueType);                        \
     template <typename ValueType>                                           \
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL(ValueType);                 \
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL(ValueType);               \
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                   \
     template <typename ValueType>                                           \
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 1df5e9d3356..0a40e840e84 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -74,14 +74,14 @@ constexpr int default_block_size = 512;
 
 
 template <typename ValueType>
-void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Dense<ValueType>* x,
-                        const matrix::Dense<ValueType>* y,
-                        matrix::Dense<ValueType>* result)
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result)
 {
-    if (cublas::is_supported<ValueType>::value) {
-        auto handle = exec->get_cublas_handle();
-        if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
             cublas::pointer_mode_guard pm_guard(handle);
             cublas::dot(handle, x->get_size()[0], x->get_const_values(),
                         x->get_size()[1], y->get_const_values(),
@@ -90,12 +90,12 @@ void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
             GKO_NOT_IMPLEMENTED;
         }
     } else {
-        GKO_NOT_IMPLEMENTED;
+        compute_dot(exec, x, y, result);
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
 template <typename ValueType>
diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp
index a4d9622a16f..a45274cdb9b 100644
--- a/cuda/test/matrix/dense_kernels.cpp
+++ b/cuda/test/matrix/dense_kernels.cpp
@@ -193,28 +193,6 @@ TEST_F(Dense, SingleVectorCudaComputeDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, SingleVectorCudaComputeDotVendorIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-    dx->set_strategy(Mtx::strategy_type::vendor);
-
-    x->compute_dot(y.get(), expected.get());
-    dx->compute_dot(dy.get(), dresult.get());
-
-    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
-}
-
-
-TEST_F(Dense, MultipleVectorCudaComputeDotVendorFails)
-{
-    set_up_vector_data(20);
-    dx->set_strategy(Mtx::strategy_type::vendor);
-
-    x->compute_dot(y.get(), expected.get());
-    ASSERT_THROW(dx->compute_dot(dy.get(), dresult.get()), gko::NotImplemented);
-}
-
-
 TEST_F(Dense, MultipleVectorCudaComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(20);
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 17d5e74be28..705155b0ea8 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -190,13 +190,17 @@ GKO_ENABLE_DEFAULT_CONFIG_CALL(conj_transpose_call, conj_transpose,
 
 
 template <typename ValueType>
-void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Dense<ValueType>* x,
-                        const matrix::Dense<ValueType>* y,
-                        matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result)
+{
+    // TODO Add onemkl for single column
+    compute_dot(exec, x, y, result);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
 template <typename ValueType>
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 77b7f36806b..b7a22d1c87e 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -77,14 +77,14 @@ constexpr int default_block_size = 512;
 
 
 template <typename ValueType>
-void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Dense<ValueType>* x,
-                        const matrix::Dense<ValueType>* y,
-                        matrix::Dense<ValueType>* result)
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result)
 {
-    if (hipblas::is_supported<ValueType>::value) {
-        auto handle = exec->get_hipblas_handle();
-        if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
             hipblas::pointer_mode_guard pm_guard(handle);
             hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
                          x->get_size()[1], y->get_const_values(),
@@ -93,12 +93,12 @@ void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
             GKO_NOT_IMPLEMENTED;
         }
     } else {
-        GKO_NOT_IMPLEMENTED;
+        compute_dot(exec, x, y, result);
     }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
 template <typename ValueType>
diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp
index e111a10b761..894b5fcc408 100644
--- a/hip/test/matrix/dense_kernels.hip.cpp
+++ b/hip/test/matrix/dense_kernels.hip.cpp
@@ -189,28 +189,6 @@ TEST_F(Dense, SingleVectorHipComputeDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, SingleVectorHipComputeDotVendorIsEquivalentToRef)
-{
-    set_up_vector_data(1);
-    dx->set_strategy(Mtx::strategy_type::vendor);
-
-    x->compute_dot(y.get(), expected.get());
-    dx->compute_dot(dy.get(), dresult.get());
-
-    GKO_ASSERT_MTX_NEAR(dresult, expected, 1e-14);
-}
-
-
-TEST_F(Dense, MultipleVectorHipComputeDotVendorFails)
-{
-    set_up_vector_data(20);
-    dx->set_strategy(Mtx::strategy_type::vendor);
-
-    x->compute_dot(y.get(), expected.get());
-    ASSERT_THROW(dx->compute_dot(dy.get(), dresult.get()), gko::NotImplemented);
-}
-
-
 TEST_F(Dense, MultipleVectorHipComputeDotIsEquivalentToRef)
 {
     set_up_vector_data(20);
diff --git a/include/ginkgo/core/matrix/dense.hpp b/include/ginkgo/core/matrix/dense.hpp
index 1e64c1682b3..e7fac1987f2 100644
--- a/include/ginkgo/core/matrix/dense.hpp
+++ b/include/ginkgo/core/matrix/dense.hpp
@@ -155,8 +155,6 @@ class Dense
 
     using row_major_range = gko::range<gko::accessor::row_major<ValueType, 2>>;
 
-    enum class strategy_type { vendor, gko };
-
     /**
      * Creates a Dense matrix with the same size and stride as another Dense
      * matrix.
@@ -185,11 +183,10 @@ class Dense
      */
     static std::unique_ptr<Dense> create_with_type_of(
         const Dense* other, std::shared_ptr<const Executor> exec,
-        const dim<2>& size = dim<2>{},
-        const strategy_type strategy = strategy_type::gko)
+        const dim<2>& size = dim<2>{})
     {
         // See create_with_config_of()
-        return (*other).create_with_type_of_impl(exec, size, size[1], strategy);
+        return (*other).create_with_type_of_impl(exec, size, size[1]);
     }
 
     /**
@@ -202,11 +199,10 @@ class Dense
      */
     static std::unique_ptr<Dense> create_with_type_of(
         const Dense* other, std::shared_ptr<const Executor> exec,
-        const dim<2>& size, size_type stride,
-        const strategy_type strategy = strategy_type::gko)
+        const dim<2>& size, size_type stride)
     {
         // See create_with_config_of()
-        return (*other).create_with_type_of_impl(exec, size, stride, strategy);
+        return (*other).create_with_type_of_impl(exec, size, stride);
     }
 
     friend class Dense<next_precision<ValueType>>;
@@ -638,19 +634,6 @@ class Dense
         return values_.get_num_elems();
     }
 
-    /** Returns the strategy to be used for the operations. See @strategy_type
-     *
-     * @return the strategy
-     */
-    strategy_type get_strategy() const noexcept { return strategy_; }
-
-    /**
-     * Set the strategy
-     *
-     * @param strategy the dense strategy
-     */
-    void set_strategy(strategy_type strategy) { strategy_ = strategy; }
-
     /**
      * Returns a single element of the matrix.
      *
@@ -930,9 +913,8 @@ class Dense
      * @param exec  Executor associated to the matrix
      * @param size  size of the matrix
      */
-    Dense(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{},
-          const strategy_type strategy = strategy_type::gko)
-        : Dense(std::move(exec), size, size[1], strategy)
+    Dense(std::shared_ptr<const Executor> exec, const dim<2>& size = dim<2>{})
+        : Dense(std::move(exec), size, size[1])
     {}
 
     /**
@@ -945,11 +927,10 @@ class Dense
      *                  number of matrix elements)
      */
     Dense(std::shared_ptr<const Executor> exec, const dim<2>& size,
-          size_type stride, const strategy_type strategy = strategy_type::gko)
+          size_type stride)
         : EnableLinOp<Dense>(exec, size),
           values_(exec, size[0] * stride),
-          stride_(stride),
-          strategy_(strategy)
+          stride_(stride)
     {}
 
     /**
@@ -970,12 +951,10 @@ class Dense
      */
     template <typename ValuesArray>
     Dense(std::shared_ptr<const Executor> exec, const dim<2>& size,
-          ValuesArray&& values, size_type stride,
-          const strategy_type strategy = strategy_type::gko)
+          ValuesArray&& values, size_type stride)
         : EnableLinOp<Dense>(exec, size),
           values_{exec, std::forward<ValuesArray>(values)},
-          stride_{stride},
-          strategy_(strategy)
+          stride_{stride}
     {
         if (size[0] > 0 && size[1] > 0) {
             GKO_ENSURE_IN_BOUNDS((size[0] - 1) * stride + size[1] - 1,
@@ -992,7 +971,7 @@ class Dense
     virtual std::unique_ptr<Dense> create_with_same_config() const
     {
         return Dense::create(this->get_executor(), this->get_size(),
-                             this->get_stride(), this->get_strategy());
+                             this->get_stride());
     }
 
     /**
@@ -1004,10 +983,9 @@ class Dense
      */
     virtual std::unique_ptr<Dense> create_with_type_of_impl(
         std::shared_ptr<const Executor> exec, const dim<2>& size,
-        size_type stride,
-        const strategy_type strategy = strategy_type::gko) const
+        size_type stride) const
     {
-        return Dense::create(exec, size, stride, strategy);
+        return Dense::create(exec, size, stride);
     }
 
     template <typename IndexType>
@@ -1128,7 +1106,7 @@ class Dense
                 this->get_executor(),
                 range_result.length(0) * range_this.length(1) - columns.begin,
                 range_result->data),
-            stride, this->get_strategy());
+            stride);
     }
 
     void apply_impl(const LinOp* b, LinOp* x) const override;
@@ -1183,7 +1161,6 @@ class Dense
 private:
     Array<value_type> values_;
     size_type stride_;
-    strategy_type strategy_{strategy_type::gko};
 };
 
 
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index df78b50a29a..924832456f7 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -69,13 +69,17 @@ namespace dense {
 
 
 template <typename ValueType>
-void compute_dot_vendor(std::shared_ptr<const DefaultExecutor> exec,
-                        const matrix::Dense<ValueType>* x,
-                        const matrix::Dense<ValueType>* y,
-                        matrix::Dense<ValueType>* result) GKO_NOT_IMPLEMENTED;
+void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result)
+{
+    // OpenMP uses the unified kernel.
+    compute_dot(exec, x, y, result);
+}
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
 template <typename ValueType>
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index f8321c2857c..4cfc4246976 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -298,23 +298,16 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 
 
 template <typename ValueType>
-void compute_dot_vendor(std::shared_ptr<const ReferenceExecutor> exec,
-                        const matrix::Dense<ValueType>* x,
-                        const matrix::Dense<ValueType>* y,
-                        matrix::Dense<ValueType>* result)
+void compute_dot_dispatch(std::shared_ptr<const ReferenceExecutor> exec,
+                          const matrix::Dense<ValueType>* x,
+                          const matrix::Dense<ValueType>* y,
+                          matrix::Dense<ValueType>* result)
 {
-    for (size_type j = 0; j < x->get_size()[1]; ++j) {
-        result->at(0, j) = zero<ValueType>();
-    }
-    for (size_type i = 0; i < x->get_size()[0]; ++i) {
-        for (size_type j = 0; j < x->get_size()[1]; ++j) {
-            result->at(0, j) += x->at(i, j) * y->at(i, j);
-        }
-    }
+    compute_dot(exec, x, y, result);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
-    GKO_DECLARE_DENSE_COMPUTE_DOT_VENDOR_KERNEL);
+    GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
 template <typename ValueType>

From fd49a7fb5f9d4aa585fc0525b8b2745743c197ae Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Thu, 10 Feb 2022 22:25:42 +0100
Subject: [PATCH 4/7] Move single vector conj_dot and nrm2 to vendor

---
 core/device_hooks/common_kernels.inc.cpp |  2 +
 core/matrix/dense.cpp                    |  9 ++--
 core/matrix/dense_kernels.hpp            | 15 +++++++
 cuda/matrix/dense_kernels.cu             | 52 +++++++++++++++++++++++-
 hip/matrix/dense_kernels.hip.cpp         | 52 +++++++++++++++++++++++-
 omp/matrix/dense_kernels.cpp             | 25 ++++++++++++
 reference/matrix/dense_kernels.cpp       | 25 ++++++++++++
 7 files changed, 173 insertions(+), 7 deletions(-)

diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
index 015c83cfdc3..2f7e7e9fe2a 100644
--- a/core/device_hooks/common_kernels.inc.cpp
+++ b/core/device_hooks/common_kernels.inc.cpp
@@ -257,7 +257,9 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_SUB_SCALED_DIAG_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
+GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
 GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL);
 GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL);
diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp
index 94a565ffa0c..aebcf96b9f6 100644
--- a/core/matrix/dense.cpp
+++ b/core/matrix/dense.cpp
@@ -79,7 +79,10 @@ GKO_REGISTER_OPERATION(sub_scaled_diag, dense::sub_scaled_diag);
 GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot);
 GKO_REGISTER_OPERATION(compute_dot_dispatch, dense::compute_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot);
+GKO_REGISTER_OPERATION(compute_conj_dot_dispatch,
+                       dense::compute_conj_dot_dispatch);
 GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2);
+GKO_REGISTER_OPERATION(compute_norm2_dispatch, dense::compute_norm2_dispatch);
 GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1);
 GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row);
 GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs,
@@ -287,8 +290,8 @@ void Dense<ValueType>::compute_conj_dot_impl(const LinOp* b,
     auto exec = this->get_executor();
     auto dense_b = make_temporary_conversion<ValueType>(b);
     auto dense_res = make_temporary_conversion<ValueType>(result);
-    exec->run(
-        dense::make_compute_conj_dot(this, dense_b.get(), dense_res.get()));
+    exec->run(dense::make_compute_conj_dot_dispatch(this, dense_b.get(),
+                                                    dense_res.get()));
 }
 
 
@@ -299,7 +302,7 @@ void Dense<ValueType>::compute_norm2_impl(LinOp* result) const
     auto exec = this->get_executor();
     auto dense_res =
         make_temporary_conversion<remove_complex<ValueType>>(result);
-    exec->run(dense::make_compute_norm2(this, dense_res.get()));
+    exec->run(dense::make_compute_norm2_dispatch(this, dense_res.get()));
 }
 
 template <typename ValueType>
diff --git a/core/matrix/dense_kernels.hpp b/core/matrix/dense_kernels.hpp
index 45adb45ae71..26ac73462e4 100644
--- a/core/matrix/dense_kernels.hpp
+++ b/core/matrix/dense_kernels.hpp
@@ -116,6 +116,12 @@ namespace kernels {
                      const matrix::Dense<_type>* y,               \
                      matrix::Dense<_type>* result)
 
+#define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(_type)     \
+    void compute_conj_dot_dispatch(                                   \
+        std::shared_ptr<const DefaultExecutor> exec,                  \
+        const matrix::Dense<_type>* x, const matrix::Dense<_type>* y, \
+        matrix::Dense<_type>* result)
+
 #define GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(_type)               \
     void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec, \
                           const matrix::Dense<_type>* x,               \
@@ -127,6 +133,11 @@ namespace kernels {
                        const matrix::Dense<_type>* x,               \
                        matrix::Dense<remove_complex<_type>>* result)
 
+#define GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(_type)               \
+    void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec, \
+                                const matrix::Dense<_type>* x,               \
+                                matrix::Dense<remove_complex<_type>>* result)
+
 #define GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(_type)               \
     void compute_norm1(std::shared_ptr<const DefaultExecutor> exec, \
                        const matrix::Dense<_type>* x,               \
@@ -312,8 +323,12 @@ namespace kernels {
     template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL(ValueType);                   \
     template <typename ValueType>                                           \
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL(ValueType);          \
+    template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL(ValueType);                      \
     template <typename ValueType>                                           \
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL(ValueType);             \
+    template <typename ValueType>                                           \
     GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType);                      \
     template <typename ValueType, typename IndexType>                       \
     GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType);     \
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 0a40e840e84..08520cb063d 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -84,8 +84,8 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
             auto handle = exec->get_cublas_handle();
             cublas::pointer_mode_guard pm_guard(handle);
             cublas::dot(handle, x->get_size()[0], x->get_const_values(),
-                        x->get_size()[1], y->get_const_values(),
-                        y->get_size()[1], result->get_values());
+                        x->get_stride(), y->get_const_values(), y->get_stride(),
+                        result->get_values());
         } else {
             GKO_NOT_IMPLEMENTED;
         }
@@ -98,6 +98,54 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::pointer_mode_guard pm_guard(handle);
+            cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                             x->get_stride(), y->get_const_values(),
+                             y->get_stride(), result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result)
+{
+    if (x->get_size()[1] == 1) {
+        if (cublas::is_supported<ValueType>::value) {
+            auto handle = exec->get_cublas_handle();
+            cublas::pointer_mode_guard pm_guard(handle);
+            cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                          x->get_stride(), result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        compute_norm2(exec, x, result);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index b7a22d1c87e..1c94bcab76a 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -87,8 +87,8 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
             auto handle = exec->get_hipblas_handle();
             hipblas::pointer_mode_guard pm_guard(handle);
             hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
-                         x->get_size()[1], y->get_const_values(),
-                         y->get_size()[1], result->get_values());
+                         x->get_stride(), y->get_const_values(),
+                         y->get_stride(), result->get_values());
         } else {
             GKO_NOT_IMPLEMENTED;
         }
@@ -101,6 +101,54 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result)
+{
+    if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::pointer_mode_guard pm_guard(handle);
+            hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
+                              x->get_stride(), y->get_const_values(),
+                              y->get_stride(), result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        compute_conj_dot(exec, x, y, result);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result)
+{
+    if (x->get_size()[1] == 1) {
+        if (hipblas::is_supported<ValueType>::value) {
+            auto handle = exec->get_hipblas_handle();
+            hipblas::pointer_mode_guard pm_guard(handle);
+            hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
+                           x->get_stride(), result->get_values());
+        } else {
+            GKO_NOT_IMPLEMENTED;
+        }
+    } else {
+        compute_norm2(exec, x, result);
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/omp/matrix/dense_kernels.cpp b/omp/matrix/dense_kernels.cpp
index 924832456f7..6a7e3fa7b01 100644
--- a/omp/matrix/dense_kernels.cpp
+++ b/omp/matrix/dense_kernels.cpp
@@ -82,6 +82,31 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result)
+{
+    compute_conj_dot(exec, x, y, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result)
+{
+    compute_norm2(exec, x, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/reference/matrix/dense_kernels.cpp b/reference/matrix/dense_kernels.cpp
index 4cfc4246976..f0ffc237d9c 100644
--- a/reference/matrix/dense_kernels.cpp
+++ b/reference/matrix/dense_kernels.cpp
@@ -329,6 +329,19 @@ void compute_conj_dot(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_KERNEL);
 
 
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result)
+{
+    compute_conj_dot(exec, x, y, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void compute_norm2(std::shared_ptr<const ReferenceExecutor> exec,
                    const matrix::Dense<ValueType>* x,
@@ -350,6 +363,18 @@ void compute_norm2(std::shared_ptr<const ReferenceExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL);
 
 
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result)
+{
+    compute_norm2(exec, x, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void compute_norm1(std::shared_ptr<const ReferenceExecutor> exec,
                    const matrix::Dense<ValueType>* x,

From 02ee7c60d71f8552d5ceea2060041e0fe9f1b4f2 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 11 Feb 2022 08:39:10 +0100
Subject: [PATCH 5/7] Fix dpcpp kernels and use unified everywhere else

---
 cuda/matrix/dense_kernels.cu      |  6 +++---
 dpcpp/matrix/dense_kernels.dp.cpp | 27 ++++++++++++++++++++++++++-
 hip/matrix/dense_kernels.hip.cpp  |  6 +++---
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 08520cb063d..060e4faa6fd 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -87,7 +87,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                         x->get_stride(), y->get_const_values(), y->get_stride(),
                         result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_dot(exec, x, y, result);
         }
     } else {
         compute_dot(exec, x, y, result);
@@ -112,7 +112,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                              x->get_stride(), y->get_const_values(),
                              y->get_stride(), result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_conj_dot(exec, x, y, result);
         }
     } else {
         compute_conj_dot(exec, x, y, result);
@@ -135,7 +135,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
             cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
                           x->get_stride(), result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_norm2(exec, x, result);
         }
     } else {
         compute_norm2(exec, x, result);
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 705155b0ea8..05d0ecb1b51 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -195,7 +195,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                           const matrix::Dense<ValueType>* y,
                           matrix::Dense<ValueType>* result)
 {
-    // TODO Add onemkl for single column
+    // TODO Add onemkl for single column ?
     compute_dot(exec, x, y, result);
 }
 
@@ -203,6 +203,31 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
     GKO_DECLARE_DENSE_COMPUTE_DOT_DISPATCH_KERNEL);
 
 
+template <typename ValueType>
+void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                               const matrix::Dense<ValueType>* x,
+                               const matrix::Dense<ValueType>* y,
+                               matrix::Dense<ValueType>* result)
+{
+    compute_conj_dot(exec, x, y, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL);
+
+
+template <typename ValueType>
+void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
+                            const matrix::Dense<ValueType>* x,
+                            matrix::Dense<remove_complex<ValueType>>* result)
+{
+    compute_norm2(exec, x, result);
+}
+
+GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
+    GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL);
+
+
 template <typename ValueType>
 void simple_apply(std::shared_ptr<const DefaultExecutor> exec,
                   const matrix::Dense<ValueType>* a,
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 1c94bcab76a..53fd9e4d284 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -90,7 +90,7 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                          x->get_stride(), y->get_const_values(),
                          y->get_stride(), result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_dot(exec, x, y, result);
         }
     } else {
         compute_dot(exec, x, y, result);
@@ -115,7 +115,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                               x->get_stride(), y->get_const_values(),
                               y->get_stride(), result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_conj_dot(exec, x, y, result);
         }
     } else {
         compute_conj_dot(exec, x, y, result);
@@ -138,7 +138,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
             hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
                            x->get_stride(), result->get_values());
         } else {
-            GKO_NOT_IMPLEMENTED;
+            compute_norm2(exec, x, result);
         }
     } else {
         compute_norm2(exec, x, result);

From 3287119b05934c0d075acdd73909280df1a82c15 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 11 Feb 2022 14:15:47 +0100
Subject: [PATCH 6/7] Fix issues with PM guards

---
 cuda/matrix/dense_kernels.cu     | 3 ---
 hip/matrix/dense_kernels.hip.cpp | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 060e4faa6fd..0d39cbb9fbc 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -82,7 +82,6 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
         if (cublas::is_supported<ValueType>::value) {
             auto handle = exec->get_cublas_handle();
-            cublas::pointer_mode_guard pm_guard(handle);
             cublas::dot(handle, x->get_size()[0], x->get_const_values(),
                         x->get_stride(), y->get_const_values(), y->get_stride(),
                         result->get_values());
@@ -107,7 +106,6 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
         if (cublas::is_supported<ValueType>::value) {
             auto handle = exec->get_cublas_handle();
-            cublas::pointer_mode_guard pm_guard(handle);
             cublas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
                              x->get_stride(), y->get_const_values(),
                              y->get_stride(), result->get_values());
@@ -131,7 +129,6 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1) {
         if (cublas::is_supported<ValueType>::value) {
             auto handle = exec->get_cublas_handle();
-            cublas::pointer_mode_guard pm_guard(handle);
             cublas::norm2(handle, x->get_size()[0], x->get_const_values(),
                           x->get_stride(), result->get_values());
         } else {
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 53fd9e4d284..d883f415770 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -85,7 +85,6 @@ void compute_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
         if (hipblas::is_supported<ValueType>::value) {
             auto handle = exec->get_hipblas_handle();
-            hipblas::pointer_mode_guard pm_guard(handle);
             hipblas::dot(handle, x->get_size()[0], x->get_const_values(),
                          x->get_stride(), y->get_const_values(),
                          y->get_stride(), result->get_values());
@@ -110,7 +109,6 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1 && y->get_size()[1] == 1) {
         if (hipblas::is_supported<ValueType>::value) {
             auto handle = exec->get_hipblas_handle();
-            hipblas::pointer_mode_guard pm_guard(handle);
             hipblas::conj_dot(handle, x->get_size()[0], x->get_const_values(),
                               x->get_stride(), y->get_const_values(),
                               y->get_stride(), result->get_values());
@@ -134,7 +132,6 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
     if (x->get_size()[1] == 1) {
         if (hipblas::is_supported<ValueType>::value) {
             auto handle = exec->get_hipblas_handle();
-            hipblas::pointer_mode_guard pm_guard(handle);
             hipblas::norm2(handle, x->get_size()[0], x->get_const_values(),
                            x->get_stride(), result->get_values());
         } else {

From 3ea6e0cb31f4fdb4c578740006b0896be7717374 Mon Sep 17 00:00:00 2001
From: Pratik Nayak <pratikvn@protonmail.com>
Date: Fri, 11 Feb 2022 17:27:16 +0100
Subject: [PATCH 7/7] Add single vector nrm2 tests

---
 cuda/test/matrix/dense_kernels.cpp    | 16 +++++++++++++++-
 dpcpp/matrix/dense_kernels.dp.cpp     |  2 ++
 dpcpp/test/matrix/dense_kernels.cpp   | 16 +++++++++++++++-
 hip/test/matrix/dense_kernels.hip.cpp | 16 +++++++++++++++-
 omp/test/matrix/dense_kernels.cpp     | 16 +++++++++++++++-
 5 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/cuda/test/matrix/dense_kernels.cpp b/cuda/test/matrix/dense_kernels.cpp
index a45274cdb9b..7d292a8b5fa 100644
--- a/cuda/test/matrix/dense_kernels.cpp
+++ b/cuda/test/matrix/dense_kernels.cpp
@@ -226,7 +226,21 @@ TEST_F(Dense, MultipleVectorCudaComputeConjDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, CudaComputeNorm2IsEquivalentToRef)
+TEST_F(Dense, SingleVectorCudaComputeNorm2IsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->cuda, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorCudaComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
     auto norm_size = gko::dim<2>{1, x->get_size()[1]};
diff --git a/dpcpp/matrix/dense_kernels.dp.cpp b/dpcpp/matrix/dense_kernels.dp.cpp
index 05d0ecb1b51..2a7120ffa0b 100644
--- a/dpcpp/matrix/dense_kernels.dp.cpp
+++ b/dpcpp/matrix/dense_kernels.dp.cpp
@@ -209,6 +209,7 @@ void compute_conj_dot_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                                const matrix::Dense<ValueType>* y,
                                matrix::Dense<ValueType>* result)
 {
+    // TODO Add onemkl for single column ?
     compute_conj_dot(exec, x, y, result);
 }
 
@@ -221,6 +222,7 @@ void compute_norm2_dispatch(std::shared_ptr<const DefaultExecutor> exec,
                             const matrix::Dense<ValueType>* x,
                             matrix::Dense<remove_complex<ValueType>>* result)
 {
+    // TODO Add onemkl for single column ?
     compute_norm2(exec, x, result);
 }
 
diff --git a/dpcpp/test/matrix/dense_kernels.cpp b/dpcpp/test/matrix/dense_kernels.cpp
index a426c88116f..56ef8b5532e 100644
--- a/dpcpp/test/matrix/dense_kernels.cpp
+++ b/dpcpp/test/matrix/dense_kernels.cpp
@@ -230,7 +230,21 @@ TEST_F(Dense, MultipleVectorDpcppComputeConjDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, DpcppComputeNorm2IsEquivalentToRef)
+TEST_F(Dense, SingleVectorDpcppComputeNorm2IsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->dpcpp, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, r<vtype>::value);
+}
+
+
+TEST_F(Dense, MultipleVectorDpcppComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
     auto norm_size = gko::dim<2>{1, x->get_size()[1]};
diff --git a/hip/test/matrix/dense_kernels.hip.cpp b/hip/test/matrix/dense_kernels.hip.cpp
index 894b5fcc408..a6ae2df195b 100644
--- a/hip/test/matrix/dense_kernels.hip.cpp
+++ b/hip/test/matrix/dense_kernels.hip.cpp
@@ -222,7 +222,21 @@ TEST_F(Dense, MultipleVectorHipComputeConjDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, HipComputeNorm2IsEquivalentToRef)
+TEST_F(Dense, SingleHipComputeNorm2IsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->hip, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleHipComputeNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
     auto norm_size = gko::dim<2>{1, x->get_size()[1]};
diff --git a/omp/test/matrix/dense_kernels.cpp b/omp/test/matrix/dense_kernels.cpp
index 9d8db3975b9..7b8c4ac365b 100644
--- a/omp/test/matrix/dense_kernels.cpp
+++ b/omp/test/matrix/dense_kernels.cpp
@@ -239,7 +239,21 @@ TEST_F(Dense, MultipleVectorOmpComputeConjDotIsEquivalentToRef)
 }
 
 
-TEST_F(Dense, ComputesNorm2IsEquivalentToRef)
+TEST_F(Dense, SingleVectorComputesNorm2IsEquivalentToRef)
+{
+    set_up_vector_data(1);
+    auto norm_size = gko::dim<2>{1, x->get_size()[1]};
+    auto norm_expected = NormVector::create(this->ref, norm_size);
+    auto dnorm = NormVector::create(this->omp, norm_size);
+
+    x->compute_norm2(norm_expected.get());
+    dx->compute_norm2(dnorm.get());
+
+    GKO_ASSERT_MTX_NEAR(norm_expected, dnorm, 1e-14);
+}
+
+
+TEST_F(Dense, MultipleVectorComputesNorm2IsEquivalentToRef)
 {
     set_up_vector_data(20);
     auto norm_size = gko::dim<2>{1, x->get_size()[1]};