From 0727a35e1e04c39f4ed13c085ed0fe730def59b3 Mon Sep 17 00:00:00 2001 From: Fritz Goebel Date: Wed, 27 Jul 2022 06:09:41 -0400 Subject: [PATCH] Add algorithm choice for triangular solvers --- core/solver/lower_trs.cpp | 3 ++- core/solver/lower_trs_kernels.hpp | 11 +++++---- core/solver/upper_trs.cpp | 3 ++- core/solver/upper_trs_kernels.hpp | 11 +++++---- cuda/solver/lower_trs_kernels.cu | 5 ++-- cuda/solver/upper_trs_kernels.cu | 5 ++-- cuda/test/solver/lower_trs_kernels.cpp | 28 ++++++++++++---------- cuda/test/solver/upper_trs_kernels.cpp | 16 ++++++++----- dpcpp/solver/lower_trs_kernels.dp.cpp | 3 ++- dpcpp/solver/upper_trs_kernels.dp.cpp | 3 ++- hip/solver/lower_trs_kernels.hip.cpp | 3 ++- hip/solver/upper_trs_kernels.hip.cpp | 3 ++- include/ginkgo/core/solver/lower_trs.hpp | 9 +++++++ include/ginkgo/core/solver/solver_base.hpp | 8 +++++++ include/ginkgo/core/solver/upper_trs.hpp | 9 +++++++ omp/solver/lower_trs_kernels.cpp | 3 ++- omp/solver/upper_trs_kernels.cpp | 3 ++- reference/solver/lower_trs_kernels.cpp | 3 ++- reference/solver/upper_trs_kernels.cpp | 3 ++- 19 files changed, 90 insertions(+), 42 deletions(-) diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp index ca088fec5ec..75878537c8b 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -136,7 +136,8 @@ void LowerTrs::generate() if (this->get_system_matrix()) { this->get_executor()->run(lower_trs::make_generate( this->get_system_matrix().get(), this->solve_struct_, - this->get_parameters().unit_diagonal, parameters_.num_rhs)); + this->get_parameters().unit_diagonal, parameters_.algorithm, + parameters_.num_rhs)); } } diff --git a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp index a2726816736..dfeb298118b 100644 --- a/core/solver/lower_trs_kernels.hpp +++ b/core/solver/lower_trs_kernels.hpp @@ -58,11 +58,12 @@ namespace lower_trs { bool& do_transpose) -#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \ - void generate(std::shared_ptr exec, \ - const matrix::Csr<_vtype, _itype>* matrix, \ - std::shared_ptr& solve_struct, \ - bool unit_diag, const gko::size_type num_rhs) +#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \ + void generate(std::shared_ptr exec, \ + const matrix::Csr<_vtype, _itype>* matrix, \ + std::shared_ptr& solve_struct, \ + bool unit_diag, const solver::trisolve_algorithm algorithm, \ + const gko::size_type num_rhs) #define GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(_vtype, _itype) \ diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index e8190d2eb8b..c999b14e9e0 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -136,7 +136,8 @@ void UpperTrs::generate() if (this->get_system_matrix()) { this->get_executor()->run(upper_trs::make_generate( this->get_system_matrix().get(), this->solve_struct_, - this->get_parameters().unit_diagonal, parameters_.num_rhs)); + this->get_parameters().unit_diagonal, parameters_.algorithm, + parameters_.num_rhs)); } } diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp index bd470213ad1..63e79b5009a 100644 --- a/core/solver/upper_trs_kernels.hpp +++ b/core/solver/upper_trs_kernels.hpp @@ -58,11 +58,12 @@ namespace upper_trs { bool& do_transpose) -#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \ - void generate(std::shared_ptr exec, \ - const matrix::Csr<_vtype, _itype>* matrix, \ - std::shared_ptr& solve_struct, \ - bool unit_diag, const gko::size_type num_rhs) +#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \ + void generate(std::shared_ptr exec, \ + const matrix::Csr<_vtype, _itype>* matrix, \ + std::shared_ptr& solve_struct, \ + bool unit_diag, const solver::trisolve_algorithm algorithm, \ + const gko::size_type num_rhs) #define GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(_vtype, _itype) \ diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index d7ea091e6b9..ff40d94106c 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -73,9 +73,10 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { - if (matrix->get_strategy()->get_name() == "sparselib") { + if (algorithm == solver::trisolve_algorithm::sparselib) { generate_kernel(exec, matrix, solve_struct, num_rhs, false, unit_diag); } diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index b6229863318..8f41c95f506 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -73,9 +73,10 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { - if (matrix->get_strategy()->get_name() == "sparselib") { + if (algorithm == solver::trisolve_algorithm::sparselib) { generate_kernel(exec, matrix, solve_struct, num_rhs, true, unit_diag); } diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index c8b7f323c4b..38fd6945939 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -133,12 +133,14 @@ TEST_F(LowerTrs, CudaLowerTrsFlagCheckIsCorrect) } -TEST_F(LowerTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef) +TEST_F(LowerTrs, CudaSingleRhsApplySparselibIsEquivalentToRef) { initialize_data(50, 1); auto lower_trs_factory = gko::solver::LowerTrs<>::build().on(ref); - auto d_lower_trs_factory = gko::solver::LowerTrs<>::build().on(cuda); - d_csr_mtx->set_strategy(std::make_shared()); + auto d_lower_trs_factory = + gko::solver::LowerTrs<>::build() + .with_algorithm(gko::solver::trisolve_algorithm::sparselib) + .on(cuda); auto solver = lower_trs_factory->generate(csr_mtx); auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); @@ -164,19 +166,27 @@ TEST_F(LowerTrs, CudaSingleRhsApplyIsEquivalentToRef) } -TEST_F(LowerTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef) +TEST_F(LowerTrs, CudaMultipleRhsApplySparselibIsEquivalentToRef) { initialize_data(50, 3); auto lower_trs_factory = gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref); auto d_lower_trs_factory = - gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(cuda); - d_csr_mtx->set_strategy(std::make_shared()); + gko::solver::LowerTrs<>::build() + .with_algorithm(gko::solver::trisolve_algorithm::sparselib) + .with_num_rhs(3u) + .on(cuda); auto solver = lower_trs_factory->generate(csr_mtx); auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); auto db2_strided = Mtx::create(cuda, b->get_size(), 4); d_b2->convert_to(db2_strided.get()); + // The cuSPARSE Generic SpSM implementation uses the wrong stride here + // so the input and output stride need to match +#if CUDA_VERSION >= 11031 + auto dx_strided = Mtx::create(cuda, x->get_size(), 4); +#else auto dx_strided = Mtx::create(cuda, x->get_size(), 5); +#endif solver->apply(b2.get(), x.get()); d_solver->apply(db2_strided.get(), dx_strided.get()); @@ -196,13 +206,7 @@ TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef) auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); auto db2_strided = Mtx::create(cuda, b->get_size(), 4); d_b2->convert_to(db2_strided.get()); - // The cuSPARSE Generic SpSM implementation uses the wrong stride here - // so the input and output stride need to match -#if CUDA_VERSION >= 11031 - auto dx_strided = Mtx::create(cuda, x->get_size(), 4); -#else auto dx_strided = Mtx::create(cuda, x->get_size(), 5); -#endif solver->apply(b2.get(), x.get()); d_solver->apply(db2_strided.get(), dx_strided.get()); diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index d9a810032de..3e0ddbc34ed 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -133,12 +133,14 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect) } -TEST_F(UpperTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef) +TEST_F(UpperTrs, CudaSingleRhsApplySparselibIsEquivalentToRef) { initialize_data(50, 1); auto upper_trs_factory = gko::solver::UpperTrs<>::build().on(ref); - auto d_upper_trs_factory = gko::solver::UpperTrs<>::build().on(cuda); - d_csr_mtx->set_strategy(std::make_shared()); + auto d_upper_trs_factory = + gko::solver::UpperTrs<>::build() + .with_algorithm(gko::solver::trisolve_algorithm::sparselib) + .on(cuda); auto solver = upper_trs_factory->generate(csr_mtx); auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); @@ -164,14 +166,16 @@ TEST_F(UpperTrs, CudaSingleRhsApplyIsEquivalentToRef) } -TEST_F(UpperTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef) +TEST_F(UpperTrs, CudaMultipleRhsApplySparselibIsEquivalentToRef) { initialize_data(50, 3); auto upper_trs_factory = gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref); auto d_upper_trs_factory = - gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(cuda); - d_csr_mtx->set_strategy(std::make_shared()); + gko::solver::UpperTrs<>::build() + .with_algorithm(gko::solver::trisolve_algorithm::sparselib) + .with_num_rhs(3u) + .on(cuda); auto solver = upper_trs_factory->generate(csr_mtx); auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); auto db2_strided = Mtx::create(cuda, b->get_size(), 4); diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp index f24acf16c0c..d6bfcf215b3 100644 --- a/dpcpp/solver/lower_trs_kernels.dp.cpp +++ b/dpcpp/solver/lower_trs_kernels.dp.cpp @@ -70,7 +70,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp index 783bac9577b..5ff662065d7 100644 --- a/dpcpp/solver/upper_trs_kernels.dp.cpp +++ b/dpcpp/solver/upper_trs_kernels.dp.cpp @@ -70,7 +70,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 42606c0aab5..0785f3f75b9 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -73,7 +73,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { generate_kernel(exec, matrix, solve_struct, num_rhs, false, unit_diag); diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index 823bedecf75..c5e831b3be1 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -73,7 +73,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { generate_kernel(exec, matrix, solve_struct, num_rhs, true, unit_diag); diff --git a/include/ginkgo/core/solver/lower_trs.hpp b/include/ginkgo/core/solver/lower_trs.hpp index 8274af09e2a..55a5bcbe20e 100644 --- a/include/ginkgo/core/solver/lower_trs.hpp +++ b/include/ginkgo/core/solver/lower_trs.hpp @@ -116,6 +116,15 @@ class LowerTrs : public EnableLinOp>, * (false) or should it assume they are 1.0 (true)? */ bool GKO_FACTORY_PARAMETER_SCALAR(unit_diagonal, false); + + /** + * Select the implementation which is supposed to be used for + * the triangular solver. This only matters for the Cuda + * executor where the choice is between the Ginkgo and the + * Cusparse implementation. Default is Ginkgo. + */ + trisolve_algorithm GKO_FACTORY_PARAMETER_SCALAR( + algorithm, trisolve_algorithm::ginkgo); }; GKO_ENABLE_LIN_OP_FACTORY(LowerTrs, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/include/ginkgo/core/solver/solver_base.hpp b/include/ginkgo/core/solver/solver_base.hpp index 107eb2e377d..8b294d52bf6 100644 --- a/include/ginkgo/core/solver/solver_base.hpp +++ b/include/ginkgo/core/solver/solver_base.hpp @@ -590,6 +590,14 @@ class EnablePreconditionedIterativeSolver }; +/** + * A helper for algorithm selection in the triangular solvers. + * It currently only matters for the Cuda executor as there, + * we have a choice between the Ginkgo and Cusparse implementations. + */ +enum class trisolve_algorithm { sparselib, ginkgo }; + + } // namespace solver } // namespace gko diff --git a/include/ginkgo/core/solver/upper_trs.hpp b/include/ginkgo/core/solver/upper_trs.hpp index 09356de8da6..36855a1fc95 100644 --- a/include/ginkgo/core/solver/upper_trs.hpp +++ b/include/ginkgo/core/solver/upper_trs.hpp @@ -116,6 +116,15 @@ class UpperTrs : public EnableLinOp>, * (false) or should it assume they are 1.0 (true)? */ bool GKO_FACTORY_PARAMETER_SCALAR(unit_diagonal, false); + + /** + * Select the implementation which is supposed to be used for + * the triangular solver. This only matters for the Cuda + * executor where the choice is between the Ginkgo and the + * Cusparse implementation. Default is Ginkgo. + */ + trisolve_algorithm GKO_FACTORY_PARAMETER_SCALAR( + algorithm, trisolve_algorithm::ginkgo); }; GKO_ENABLE_LIN_OP_FACTORY(UpperTrs, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp index 0b373f8f270..a32d39f9034 100644 --- a/omp/solver/lower_trs_kernels.cpp +++ b/omp/solver/lower_trs_kernels.cpp @@ -70,7 +70,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp index f6994c0f238..9929270cc64 100644 --- a/omp/solver/upper_trs_kernels.cpp +++ b/omp/solver/upper_trs_kernels.cpp @@ -70,7 +70,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp index 20a1480a04c..4b075de2f98 100644 --- a/reference/solver/lower_trs_kernels.cpp +++ b/reference/solver/lower_trs_kernels.cpp @@ -66,7 +66,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp index 315d0a1d495..450b7c86aea 100644 --- a/reference/solver/upper_trs_kernels.cpp +++ b/reference/solver/upper_trs_kernels.cpp @@ -66,7 +66,8 @@ template void generate(std::shared_ptr exec, const matrix::Csr* matrix, std::shared_ptr& solve_struct, - bool unit_diag, const gko::size_type num_rhs) + bool unit_diag, const solver::trisolve_algorithm algorithm, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the