Skip to content

Commit

Permalink
Add algorithm choice for triangular solvers
Browse files Browse the repository at this point in the history
  • Loading branch information
fritzgoebel committed Jul 27, 2022
1 parent fb1672d commit 0727a35
Show file tree
Hide file tree
Showing 19 changed files with 90 additions and 42 deletions.
3 changes: 2 additions & 1 deletion core/solver/lower_trs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ void LowerTrs<ValueType, IndexType>::generate()
if (this->get_system_matrix()) {
this->get_executor()->run(lower_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
this->get_parameters().unit_diagonal, parameters_.algorithm,
parameters_.num_rhs));
}
}

Expand Down
11 changes: 6 additions & 5 deletions core/solver/lower_trs_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,12 @@ namespace lower_trs {
bool& do_transpose)


#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<solver::SolveStruct>& solve_struct, \
bool unit_diag, const gko::size_type num_rhs)
#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<solver::SolveStruct>& solve_struct, \
bool unit_diag, const solver::trisolve_algorithm algorithm, \
const gko::size_type num_rhs)


#define GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(_vtype, _itype) \
Expand Down
3 changes: 2 additions & 1 deletion core/solver/upper_trs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ void UpperTrs<ValueType, IndexType>::generate()
if (this->get_system_matrix()) {
this->get_executor()->run(upper_trs::make_generate(
this->get_system_matrix().get(), this->solve_struct_,
this->get_parameters().unit_diagonal, parameters_.num_rhs));
this->get_parameters().unit_diagonal, parameters_.algorithm,
parameters_.num_rhs));
}
}

Expand Down
11 changes: 6 additions & 5 deletions core/solver/upper_trs_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,12 @@ namespace upper_trs {
bool& do_transpose)


#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<gko::solver::SolveStruct>& solve_struct, \
bool unit_diag, const gko::size_type num_rhs)
#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \
void generate(std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<_vtype, _itype>* matrix, \
std::shared_ptr<gko::solver::SolveStruct>& solve_struct, \
bool unit_diag, const solver::trisolve_algorithm algorithm, \
const gko::size_type num_rhs)


#define GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(_vtype, _itype) \
Expand Down
5 changes: 3 additions & 2 deletions cuda/solver/lower_trs_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
if (algorithm == solver::trisolve_algorithm::sparselib) {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, false, unit_diag);
}
Expand Down
5 changes: 3 additions & 2 deletions cuda/solver/upper_trs_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,10 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const CudaExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
if (matrix->get_strategy()->get_name() == "sparselib") {
if (algorithm == solver::trisolve_algorithm::sparselib) {
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct,
num_rhs, true, unit_diag);
}
Expand Down
28 changes: 16 additions & 12 deletions cuda/test/solver/lower_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,14 @@ TEST_F(LowerTrs, CudaLowerTrsFlagCheckIsCorrect)
}


TEST_F(LowerTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef)
TEST_F(LowerTrs, CudaSingleRhsApplySparselibIsEquivalentToRef)
{
initialize_data(50, 1);
auto lower_trs_factory = gko::solver::LowerTrs<>::build().on(ref);
auto d_lower_trs_factory = gko::solver::LowerTrs<>::build().on(cuda);
d_csr_mtx->set_strategy(std::make_shared<CsrMtx::classical>());
auto d_lower_trs_factory =
gko::solver::LowerTrs<>::build()
.with_algorithm(gko::solver::trisolve_algorithm::sparselib)
.on(cuda);
auto solver = lower_trs_factory->generate(csr_mtx);
auto d_solver = d_lower_trs_factory->generate(d_csr_mtx);

Expand All @@ -164,19 +166,27 @@ TEST_F(LowerTrs, CudaSingleRhsApplyIsEquivalentToRef)
}


TEST_F(LowerTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef)
TEST_F(LowerTrs, CudaMultipleRhsApplySparselibIsEquivalentToRef)
{
initialize_data(50, 3);
auto lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref);
auto d_lower_trs_factory =
gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(cuda);
d_csr_mtx->set_strategy(std::make_shared<CsrMtx::classical>());
gko::solver::LowerTrs<>::build()
.with_algorithm(gko::solver::trisolve_algorithm::sparselib)
.with_num_rhs(3u)
.on(cuda);
auto solver = lower_trs_factory->generate(csr_mtx);
auto d_solver = d_lower_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(cuda, b->get_size(), 4);
d_b2->convert_to(db2_strided.get());
// The cuSPARSE Generic SpSM implementation uses the wrong stride here,
// so the input and output strides need to match
#if CUDA_VERSION >= 11031
auto dx_strided = Mtx::create(cuda, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(cuda, x->get_size(), 5);
#endif

solver->apply(b2.get(), x.get());
d_solver->apply(db2_strided.get(), dx_strided.get());
Expand All @@ -196,13 +206,7 @@ TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef)
auto d_solver = d_lower_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(cuda, b->get_size(), 4);
d_b2->convert_to(db2_strided.get());
// The cuSPARSE Generic SpSM implementation uses the wrong stride here
// so the input and output stride need to match
#if CUDA_VERSION >= 11031
auto dx_strided = Mtx::create(cuda, x->get_size(), 4);
#else
auto dx_strided = Mtx::create(cuda, x->get_size(), 5);
#endif

solver->apply(b2.get(), x.get());
d_solver->apply(db2_strided.get(), dx_strided.get());
Expand Down
16 changes: 10 additions & 6 deletions cuda/test/solver/upper_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,14 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect)
}


TEST_F(UpperTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef)
TEST_F(UpperTrs, CudaSingleRhsApplySparselibIsEquivalentToRef)
{
initialize_data(50, 1);
auto upper_trs_factory = gko::solver::UpperTrs<>::build().on(ref);
auto d_upper_trs_factory = gko::solver::UpperTrs<>::build().on(cuda);
d_csr_mtx->set_strategy(std::make_shared<CsrMtx::classical>());
auto d_upper_trs_factory =
gko::solver::UpperTrs<>::build()
.with_algorithm(gko::solver::trisolve_algorithm::sparselib)
.on(cuda);
auto solver = upper_trs_factory->generate(csr_mtx);
auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);

Expand All @@ -164,14 +166,16 @@ TEST_F(UpperTrs, CudaSingleRhsApplyIsEquivalentToRef)
}


TEST_F(UpperTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef)
TEST_F(UpperTrs, CudaMultipleRhsApplySparselibIsEquivalentToRef)
{
initialize_data(50, 3);
auto upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref);
auto d_upper_trs_factory =
gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(cuda);
d_csr_mtx->set_strategy(std::make_shared<CsrMtx::classical>());
gko::solver::UpperTrs<>::build()
.with_algorithm(gko::solver::trisolve_algorithm::sparselib)
.with_num_rhs(3u)
.on(cuda);
auto solver = upper_trs_factory->generate(csr_mtx);
auto d_solver = d_upper_trs_factory->generate(d_csr_mtx);
auto db2_strided = Mtx::create(cuda, b->get_size(), 4);
Expand Down
3 changes: 2 additions & 1 deletion dpcpp/solver/lower_trs_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL);
Expand Down
3 changes: 2 additions & 1 deletion dpcpp/solver/upper_trs_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const DpcppExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL);
Expand Down
3 changes: 2 additions & 1 deletion hip/solver/lower_trs_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const HipExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct, num_rhs,
false, unit_diag);
Expand Down
3 changes: 2 additions & 1 deletion hip/solver/upper_trs_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const HipExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
generate_kernel<ValueType, IndexType>(exec, matrix, solve_struct, num_rhs,
true, unit_diag);
Expand Down
9 changes: 9 additions & 0 deletions include/ginkgo/core/solver/lower_trs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ class LowerTrs : public EnableLinOp<LowerTrs<ValueType, IndexType>>,
* (false) or should it assume they are 1.0 (true)?
*/
bool GKO_FACTORY_PARAMETER_SCALAR(unit_diagonal, false);

/**
* Select the implementation to be used for the triangular
* solver. This only matters for the Cuda executor, where the
* choice is between the Ginkgo and the cuSPARSE
* implementations. The default is Ginkgo.
*/
trisolve_algorithm GKO_FACTORY_PARAMETER_SCALAR(
algorithm, trisolve_algorithm::ginkgo);
};
GKO_ENABLE_LIN_OP_FACTORY(LowerTrs, parameters, Factory);
GKO_ENABLE_BUILD_METHOD(Factory);
Expand Down
8 changes: 8 additions & 0 deletions include/ginkgo/core/solver/solver_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,14 @@ class EnablePreconditionedIterativeSolver
};


/**
* A helper for algorithm selection in the triangular solvers.
* It currently only matters for the Cuda executor, where there is
* a choice between the Ginkgo and cuSPARSE implementations.
*/
enum class trisolve_algorithm { sparselib, ginkgo };


} // namespace solver
} // namespace gko

Expand Down
9 changes: 9 additions & 0 deletions include/ginkgo/core/solver/upper_trs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,15 @@ class UpperTrs : public EnableLinOp<UpperTrs<ValueType, IndexType>>,
* (false) or should it assume they are 1.0 (true)?
*/
bool GKO_FACTORY_PARAMETER_SCALAR(unit_diagonal, false);

/**
* Select the implementation to be used for the triangular
* solver. This only matters for the Cuda executor, where the
* choice is between the Ginkgo and the cuSPARSE
* implementations. The default is Ginkgo.
*/
trisolve_algorithm GKO_FACTORY_PARAMETER_SCALAR(
algorithm, trisolve_algorithm::ginkgo);
};
GKO_ENABLE_LIN_OP_FACTORY(UpperTrs, parameters, Factory);
GKO_ENABLE_BUILD_METHOD(Factory);
Expand Down
3 changes: 2 additions & 1 deletion omp/solver/lower_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
// This generate kernel is here to allow for a more sophisticated
// implementation as for other executors. This kernel would perform the
Expand Down
3 changes: 2 additions & 1 deletion omp/solver/upper_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const OmpExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
// This generate kernel is here to allow for a more sophisticated
// implementation as for other executors. This kernel would perform the
Expand Down
3 changes: 2 additions & 1 deletion reference/solver/lower_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const ReferenceExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
// This generate kernel is here to allow for a more sophisticated
// implementation as for other executors. This kernel would perform the
Expand Down
3 changes: 2 additions & 1 deletion reference/solver/upper_trs_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ template <typename ValueType, typename IndexType>
void generate(std::shared_ptr<const ReferenceExecutor> exec,
const matrix::Csr<ValueType, IndexType>* matrix,
std::shared_ptr<solver::SolveStruct>& solve_struct,
bool unit_diag, const gko::size_type num_rhs)
bool unit_diag, const solver::trisolve_algorithm algorithm,
const gko::size_type num_rhs)
{
// This generate kernel is here to allow for a more sophisticated
// implementation as for other executors. This kernel would perform the
Expand Down

0 comments on commit 0727a35

Please sign in to comment.