From e622c255ee0e9191bccf3d5cfe2f00b1750cea49 Mon Sep 17 00:00:00 2001
From: Lukas Riemer
Date: Fri, 7 May 2021 18:55:11 +0200
Subject: [PATCH 01/15] Add naive lower triangular solver

---
 cuda/solver/lower_trs_kernels.cu | 131 ++++++++++++++++++++++++++++++-
 1 file changed, 128 insertions(+), 3 deletions(-)

diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index afbd545f1f1..2ac2cc7d8ed 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -62,6 +62,127 @@ namespace cuda {
 namespace lower_trs {
 
 
+constexpr int default_block_size = 512;
+
+
+template <typename ValueType>
+__device__ std::enable_if_t<std::is_floating_point<ValueType>::value, ValueType>
+load(const ValueType *values, int index)
+{
+    const volatile ValueType *val = values + index;
+    return *val;
+}
+
+template <typename ValueType>
+__device__ std::enable_if_t<std::is_floating_point<ValueType>::value,
+                            thrust::complex<ValueType>>
+load(const thrust::complex<ValueType> *values, int index)
+{
+    auto real = reinterpret_cast<const ValueType *>(values);
+    auto imag = real + 1;
+    return {load(real, 2 * index), load(imag, 2 * index)};
+}
+
+template <typename ValueType>
+__device__ void store(
+    ValueType *values, int index,
+    std::enable_if_t<std::is_floating_point<ValueType>::value, ValueType> value)
+{
+    volatile ValueType *val = values + index;
+    *val = value;
+}
+
+template <typename ValueType>
+__device__ void store(thrust::complex<ValueType> *values, int index,
+                      thrust::complex<ValueType> value)
+{
+    auto real = reinterpret_cast<ValueType *>(values);
+    auto imag = real + 1;
+    store(real, 2 * index, value.real());
+    store(imag, 2 * index, value.imag());
+}
+
+template <typename ValueType, typename IndexType>
+__global__ void sptrsv_naive_caching_kernel(
+    const IndexType *const rowptrs, const IndexType *const colidxs,
+    const ValueType *const vals, const ValueType *const b, ValueType *const x,
+    volatile uint32 *const ready, const size_type n)
+{
+    const auto gid = blockIdx.x * blockDim.x + threadIdx.x;
+    const auto row = gid;
+
+    if (row >= n) {
+        return;
+    }
+
+    const auto self_shmem_id = gid / default_block_size;
+    const auto self_shid = gid % default_block_size;
+
+    volatile __shared__ uint32 ready_s[default_block_size];
+#pragma diag_suppress static_var_with_dynamic_init
+    __shared__ ValueType x_s[default_block_size];
+    ready_s[self_shid] = 0;
+    x_s[self_shid] = zero<ValueType>();
+
+    __syncthreads();
+
+    const auto row_start = rowptrs[row];
+    const auto row_end = rowptrs[row + 1] - 1;
+
+    ValueType sum = 0.0;
+    for (IndexType i = row_start; i < row_end; ++i) {
+        const auto dependency = colidxs[i];
+        volatile auto is_ready = &ready[dependency];
+        auto x_p = &x[dependency];
+
+        const bool shmem_possible =
+            dependency / default_block_size == self_shmem_id;
+        if (shmem_possible) {
+            const auto dependency_shid = dependency % default_block_size;
+            is_ready = &ready_s[dependency_shid];
+            x_p = &x_s[dependency_shid];
+        }
+
+        uint32 is_ready_v = false;
+        while (!is_ready_v) {
+            is_ready_v = *is_ready;
+        }
+
+        sum += load(x_p, 0) * vals[i];
+    }
+
+    const auto r = (b[row] - sum) / vals[rowptrs[row + 1] - 1];
+
+    store(x_s, self_shid, r);
+    __threadfence_block();
+    ready_s[self_shid] = 1;
+
+    x[row] = r;
+    __threadfence();
+    ready[row] = 1;
+}
+
+template <typename ValueType, typename IndexType>
+void sptrsv_naive_caching(std::shared_ptr<const CudaExecutor> exec,
+                          const matrix::Csr<ValueType, IndexType> *matrix,
+                          const matrix::Dense<ValueType> *b,
+                          matrix::Dense<ValueType> *x)
+{
+    const IndexType n = matrix->get_size()[0];
+
+    Array<uint32> ready(exec, n);
+    cudaMemset(ready.get_data(), 0, n * sizeof(uint32));
+
+    const dim3 block_size(default_block_size, 1, 1);
+    const dim3 grid_size(ceildiv(n, block_size.x), 1, 1);
+
+    sptrsv_naive_caching_kernel<<<grid_size, block_size>>>(
+        matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
+        as_cuda_type(matrix->get_const_values()),
as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), + ready.get_data(), n); +} + void should_perform_transpose(std::shared_ptr exec, bool& do_transpose) { @@ -88,7 +209,6 @@ void generate(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); - template void solve(std::shared_ptr exec, const matrix::Csr* matrix, @@ -96,8 +216,13 @@ void solve(std::shared_ptr exec, matrix::Dense* trans_b, matrix::Dense* trans_x, const matrix::Dense* b, matrix::Dense* x) { - solve_kernel(exec, matrix, solve_struct, trans_b, - trans_x, b, x); + if (matrix->get_strategy()->get_name() == "sparselib") { + solve_kernel(exec, matrix, solve_struct, trans_b, + trans_x, b, x); + + } else { + sptrsv_naive_caching(exec, matrix, b, x); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( From 60eb66146c3f9d9de87619c6083ad1291b61c352 Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Fri, 7 May 2021 22:06:06 +0200 Subject: [PATCH 02/15] Review updates --- cuda/solver/lower_trs_kernels.cu | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 2ac2cc7d8ed..f8e78dd2a5a 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -48,6 +48,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" #include "cuda/solver/common_trs_kernels.cuh" @@ -108,7 +110,7 @@ __global__ void sptrsv_naive_caching_kernel( const ValueType *const vals, const ValueType *const b, ValueType *const x, volatile uint32 *const ready, const size_type n) { - const auto gid = blockIdx.x * blockDim.x + threadIdx.x; + const auto gid = thread::get_thread_id_flat(); const auto row = gid; if (row >= n) { @@ -119,8 +121,8 @@ __global__ void sptrsv_naive_caching_kernel( const auto self_shid = gid % default_block_size; volatile __shared__ uint32 ready_s[default_block_size]; -#pragma diag_suppress static_var_with_dynamic_init - __shared__ ValueType x_s[default_block_size]; + __shared__ UninitializedArray x_s_array; + ValueType *x_s = x_s_array; ready_s[self_shid] = 0; x_s[self_shid] = zero(); @@ -183,6 +185,7 @@ void sptrsv_naive_caching(std::shared_ptr exec, ready.get_data(), n); } + void should_perform_transpose(std::shared_ptr exec, bool& do_transpose) { @@ -202,13 +205,16 @@ void generate(std::shared_ptr exec, const matrix::Csr* matrix, solver::SolveStruct* solve_struct, const gko::size_type num_rhs) { - generate_kernel(exec, matrix, solve_struct, num_rhs, - false); + if (matrix->get_strategy()->get_name() == "sparselib") { + generate_kernel(exec, matrix, solve_struct, + num_rhs, false); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); + template void solve(std::shared_ptr exec, const matrix::Csr* matrix, From ed1962d40e7c950ac8590aee6bb23aafd1fd07b5 Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Fri, 4 Jun 2021 10:27:13 +0200 Subject: [PATCH 03/15] Add fallback sptrsv kernel --- cuda/solver/lower_trs_kernels.cu | 58 ++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index f8e78dd2a5a..f8c22e7ad0d 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -104,6 
+104,7 @@ __device__ void store(thrust::complex *values, int index, store(imag, 2 * index, value.imag()); } + template __global__ void sptrsv_naive_caching_kernel( const IndexType *const rowptrs, const IndexType *const colidxs, @@ -153,7 +154,7 @@ __global__ void sptrsv_naive_caching_kernel( sum += load(x_p, 0) * vals[i]; } - const auto r = (b[row] - sum) / vals[rowptrs[row + 1] - 1]; + const auto r = (b[row] - sum) / vals[row_end]; store(x_s, self_shid, r); __threadfence_block(); @@ -164,6 +165,41 @@ __global__ void sptrsv_naive_caching_kernel( ready[row] = 1; } + +template +__global__ void sptrsv_naive_legacy_kernel( + const IndexType *const rowptrs, const IndexType *const colidxs, + const ValueType *const vals, const ValueType *const b, ValueType *const x, + volatile uint32 *const ready, const size_type n) +{ + const auto gid = thread::get_thread_id_flat(); + const auto row = gid; + + if (row >= n) { + return; + } + + const auto row_end = rowptrs[row + 1] - 1; + + ValueType sum = 0.0; + auto j = rowptrs[row]; + while (j <= row_end) { + auto col = colidxs[j]; + while (ready[col]) { + sum += vals[j] * load(x, col); + ++j; + col = colidxs[j]; + } + if (row == col) { + store(x, row, (b[row] - sum) / vals[row_end]); + ++j; + __threadfence(); + ready[row] = 1; + } + } +} + + template void sptrsv_naive_caching(std::shared_ptr exec, const matrix::Csr *matrix, @@ -178,11 +214,20 @@ void sptrsv_naive_caching(std::shared_ptr exec, const dim3 block_size(default_block_size, 1, 1); const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); - sptrsv_naive_caching_kernel<<>>( - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - as_cuda_type(matrix->get_const_values()), - as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - ready.get_data(), n); + // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. + if (exec->get_major_version() < 7) { + sptrsv_naive_legacy_kernel<<>>( + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + as_cuda_type(matrix->get_const_values()), + as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), + ready.get_data(), n); + } else { + sptrsv_naive_caching_kernel<<>>( + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + as_cuda_type(matrix->get_const_values()), + as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), + ready.get_data(), n); + } } @@ -225,7 +270,6 @@ void solve(std::shared_ptr exec, if (matrix->get_strategy()->get_name() == "sparselib") { solve_kernel(exec, matrix, solve_struct, trans_b, trans_x, b, x); - } else { sptrsv_naive_caching(exec, matrix, b, x); } From a58b2502b023137f2d720963a6f6bc7e4f851c1e Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Mon, 7 Jun 2021 11:55:07 +0200 Subject: [PATCH 04/15] Shrink trisolve block size for fallback --- cuda/solver/lower_trs_kernels.cu | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index f8c22e7ad0d..0b26c3592ff 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -206,16 +206,17 @@ void sptrsv_naive_caching(std::shared_ptr exec, const matrix::Dense *b, matrix::Dense *x) { - const IndexType n = matrix->get_size()[0]; + // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. 
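The capability check that follows corresponds to a plain CUDA runtime query; a
minimal host-side sketch of what exec->get_major_version() reports
(illustrative, Ginkgo wraps this query in its executor abstraction):

    int device = 0;
    int major = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
    // major >= 7 (Volta and newer) guarantees independent thread scheduling,
    // so a spinning thread can no longer starve the thread producing the
    // value it waits for; older GPUs take the warp-sized fallback kernel.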
+    const auto is_fallback_required = exec->get_major_version() < 7;
+
+    const IndexType n = matrix->get_size()[0];
     Array<uint32> ready(exec, n);
     cudaMemset(ready.get_data(), 0, n * sizeof(uint32));
 
-    const dim3 block_size(default_block_size, 1, 1);
+    const dim3 block_size(is_fallback_required ? 32 : default_block_size, 1, 1);
     const dim3 grid_size(ceildiv(n, block_size.x), 1, 1);
 
-    // Pre-Volta GPUs may deadlock due to missing independent thread scheduling.
-    if (exec->get_major_version() < 7) {
+    if (is_fallback_required) {
         sptrsv_naive_legacy_kernel<<<grid_size, block_size>>>(
             matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
             as_cuda_type(matrix->get_const_values()),

From a6725125f3c4d8c181e055de494963d25f8be3c4 Mon Sep 17 00:00:00 2001
From: Lukas Riemer
Date: Sun, 11 Jul 2021 11:20:38 +0200
Subject: [PATCH 05/15] Add NaN optimization to sptrsv

---
 cuda/solver/lower_trs_kernels.cu  | 64 +++++++++++++++++++------------
 include/ginkgo/core/base/math.hpp | 64 +++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 24 deletions(-)

diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu
index 0b26c3592ff..07c8cce583f 100644
--- a/cuda/solver/lower_trs_kernels.cu
+++ b/cuda/solver/lower_trs_kernels.cu
@@ -109,7 +109,7 @@ template <typename ValueType, typename IndexType>
 __global__ void sptrsv_naive_caching_kernel(
     const IndexType *const rowptrs, const IndexType *const colidxs,
     const ValueType *const vals, const ValueType *const b, ValueType *const x,
-    volatile uint32 *const ready, const size_type n)
+    const size_type n, bool *nan_produced)
 {
     const auto gid = thread::get_thread_id_flat();
     const auto row = gid;
@@ -121,11 +121,9 @@ __global__ void sptrsv_naive_caching_kernel(
     const auto self_shmem_id = gid / default_block_size;
     const auto self_shid = gid % default_block_size;
 
-    volatile __shared__ uint32 ready_s[default_block_size];
     __shared__ UninitializedArray<ValueType, default_block_size> x_s_array;
     ValueType *x_s = x_s_array;
-    ready_s[self_shid] = 0;
-    x_s[self_shid] = zero<ValueType>();
+    x_s[self_shid] = nan<ValueType>();
 
     __syncthreads();
 
@@ -135,34 +133,34 @@ __global__ void sptrsv_naive_caching_kernel(
     ValueType sum = 0.0;
     for (IndexType i = row_start; i < row_end; ++i) {
         const auto dependency = colidxs[i];
-        volatile auto is_ready = &ready[dependency];
         auto x_p = &x[dependency];
 
         const bool shmem_possible =
             dependency / default_block_size == self_shmem_id;
         if (shmem_possible) {
             const auto dependency_shid = dependency % default_block_size;
-            is_ready = &ready_s[dependency_shid];
             x_p = &x_s[dependency_shid];
         }
 
-        uint32 is_ready_v = false;
-        while (!is_ready_v) {
-            is_ready_v = *is_ready;
+        ValueType x = *x_p;
+        while (is_nan(x)) {
+            x = load(x_p, 0);
         }
 
-        sum += load(x_p, 0) * vals[i];
+        sum += x * vals[i];
     }
 
     const auto r = (b[row] - sum) / vals[row_end];
 
     store(x_s, self_shid, r);
-    __threadfence_block();
-    ready_s[self_shid] = 1;
-
     x[row] = r;
+
+    // This check ensures that no infinite loops happen.
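The NaN handshake relies on IEEE-754 bit patterns: a value whose bytes are all
0xFF decodes to a (negative, quiet) NaN, which is why the
cudaMemset(x->get_values(), 0xFF, ...) further down can mark every entry of x
as "not yet computed", letting is_nan() double as the readiness flag. A
host-side sketch of the bit-pattern argument (standard C++, illustrative):

    #include <cassert>
    #include <cmath>
    #include <cstring>

    int main()
    {
        double x;
        std::memset(&x, 0xFF, sizeof(x));  // all bits set, like the cudaMemset
        assert(std::isnan(x));  // exponent all ones + nonzero mantissa == NaN
        return 0;
    }

If the computed r is itself NaN (for example, because of a zero diagonal), it
has to be replaced by a finite value in the check that follows, since
otherwise every consumer spinning on is_nan would hang.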
+ if (is_nan(r)) { + store(x_s, self_shid, zero()); + x[row] = zero(); + *nan_produced = true; + } } @@ -170,7 +168,7 @@ template __global__ void sptrsv_naive_legacy_kernel( const IndexType *const rowptrs, const IndexType *const colidxs, const ValueType *const vals, const ValueType *const b, ValueType *const x, - volatile uint32 *const ready, const size_type n) + const size_type n, bool *nan_produced) { const auto gid = thread::get_thread_id_flat(); const auto row = gid; @@ -185,16 +183,19 @@ __global__ void sptrsv_naive_legacy_kernel( auto j = rowptrs[row]; while (j <= row_end) { auto col = colidxs[j]; - while (ready[col]) { + while (!is_nan(load(x, col))) { sum += vals[j] * load(x, col); ++j; col = colidxs[j]; } if (row == col) { - store(x, row, (b[row] - sum) / vals[row_end]); + const auto r = (b[row] - sum) / vals[row_end]; + store(x, row, r); ++j; - __threadfence(); - ready[row] = 1; + if (is_nan(r)) { + store(x, row, zero()); + *nan_produced = true; + } } } } @@ -210,8 +211,12 @@ void sptrsv_naive_caching(std::shared_ptr exec, const auto is_fallback_required = exec->get_major_version() < 7; const IndexType n = matrix->get_size()[0]; - Array ready(exec, n); - cudaMemset(ready.get_data(), 0, n * sizeof(uint32)); + + // Initialize x to all NaNs. + cudaMemset(x->get_values(), 0xFF, n * sizeof(ValueType)); + + Array nan_produced(exec, 1); + cudaMemset(nan_produced.get_data(), false, sizeof(bool)); const dim3 block_size(is_fallback_required ? 32 : default_block_size, 1, 1); const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); @@ -221,13 +226,24 @@ void sptrsv_naive_caching(std::shared_ptr exec, matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), as_cuda_type(matrix->get_const_values()), as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - ready.get_data(), n); + n, nan_produced.get_data()); } else { sptrsv_naive_caching_kernel<<>>( matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), as_cuda_type(matrix->get_const_values()), as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - ready.get_data(), n); + n, nan_produced.get_data()); + } + + nan_produced.set_executor(exec->get_master()); + if (nan_produced.get_const_data()[0]) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Error: triangular solve produced NaN, either not all diagonal " + "elements are nonzero, or the system is very ill-conditioned. " + "The NaN will be replaced with a zero." + << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 } } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 17ffc9f8bad..8af4cde253b 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -1096,6 +1096,70 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b) } +/** + * Checks if a floating point number is NaN. + * + * @tparam T type of the value to check + * + * @param value value to check + * + * @return `true` if the value is NaN. + */ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> +is_nan(const T &value) +{ + return std::isnan(value); +} + + +/** + * Checks if any component of a complex value is NaN. + * + * @tparam T complex type of the value to check + * + * @param value complex value to check + * + * @return `true` if any component of the given value is NaN. + */ +template +GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( + const T &value) +{ + return std::isnan(value.real()) || std::isnan(value.imag()); +} + + +/** + * Returns NaN of the given type. 
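+ * The returned value is std::numeric_limits<T>::quiet_NaN(); since NaN
+ * compares unequal to everything, including itself, use is_nan() above to
+ * test for it.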
+ * + * @tparam T the type of the object + * + * @return NaN. + */ +template +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +nan() +{ + return std::numeric_limits::quiet_NaN(); +} + + +/** + * Returns a complex with both components NaN. + * + * @tparam T the type of the object + * + * @return complex{NaN, NaN}. + */ +template +GKO_INLINE GKO_ATTRIBUTES constexpr std::enable_if_t::value, T> +nan() +{ + return T{nan>(), nan>()}; +} + + } // namespace gko From 009fb326e62bd135821228e2fb4c6fe87dca9c69 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Thu, 26 Aug 2021 15:02:16 +0200 Subject: [PATCH 06/15] Add CUDA generic API triangular solver --- core/device_hooks/common_kernels.inc.cpp | 2 - core/solver/lower_trs.cpp | 11 +- core/solver/lower_trs_kernels.hpp | 14 +- core/solver/upper_trs.cpp | 11 +- core/solver/upper_trs_kernels.hpp | 14 +- cuda/base/cusparse_bindings.hpp | 161 +++++++++++-- cuda/base/pointer_mode_guard.hpp | 12 +- cuda/solver/common_trs_kernels.cuh | 295 +++++++++++++++-------- cuda/solver/lower_trs_kernels.cu | 46 ++-- cuda/solver/upper_trs_kernels.cu | 10 +- dpcpp/solver/lower_trs_kernels.dp.cpp | 20 +- dpcpp/solver/upper_trs_kernels.dp.cpp | 16 +- hip/solver/lower_trs_kernels.hip.cpp | 12 +- hip/solver/upper_trs_kernels.hip.cpp | 15 +- include/ginkgo/core/solver/lower_trs.hpp | 3 - include/ginkgo/core/solver/upper_trs.hpp | 3 - omp/solver/lower_trs_kernels.cpp | 11 +- omp/solver/upper_trs_kernels.cpp | 11 +- omp/test/solver/lower_trs_kernels.cpp | 15 +- omp/test/solver/upper_trs_kernels.cpp | 15 +- reference/solver/lower_trs_kernels.cpp | 11 +- reference/solver/upper_trs_kernels.cpp | 11 +- 22 files changed, 394 insertions(+), 325 deletions(-) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index bbd9a43161a..a06eac4bab6 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -318,7 +318,6 @@ namespace lower_trs { GKO_STUB(GKO_DECLARE_LOWER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL); -GKO_STUB(GKO_DECLARE_LOWER_TRS_INIT_STRUCT_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL); @@ -330,7 +329,6 @@ namespace upper_trs { GKO_STUB(GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL); -GKO_STUB(GKO_DECLARE_UPPER_TRS_INIT_STRUCT_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL); diff --git a/core/solver/lower_trs.cpp b/core/solver/lower_trs.cpp index 7ae9e1b4858..b17840477e5 100644 --- a/core/solver/lower_trs.cpp +++ b/core/solver/lower_trs.cpp @@ -55,7 +55,6 @@ namespace { GKO_REGISTER_OPERATION(generate, lower_trs::generate); -GKO_REGISTER_OPERATION(init_struct, lower_trs::init_struct); GKO_REGISTER_OPERATION(should_perform_transpose, lower_trs::should_perform_transpose); GKO_REGISTER_OPERATION(solve, lower_trs::solve); @@ -85,19 +84,11 @@ std::unique_ptr LowerTrs::conj_transpose() const } -template -void LowerTrs::init_trs_solve_struct() -{ - this->get_executor()->run(lower_trs::make_init_struct(this->solve_struct_)); -} - - template void LowerTrs::generate() { this->get_executor()->run(lower_trs::make_generate( - gko::lend(system_matrix_), gko::lend(this->solve_struct_), - parameters_.num_rhs)); + gko::lend(system_matrix_), this->solve_struct_, parameters_.num_rhs)); } diff --git a/core/solver/lower_trs_kernels.hpp b/core/solver/lower_trs_kernels.hpp index 
1aeb09918ab..78406c274dd 100644 --- a/core/solver/lower_trs_kernels.hpp +++ b/core/solver/lower_trs_kernels.hpp @@ -58,15 +58,10 @@ namespace lower_trs { bool& do_transpose) -#define GKO_DECLARE_LOWER_TRS_INIT_STRUCT_KERNEL \ - void init_struct(std::shared_ptr exec, \ - std::shared_ptr& solve_struct) - - -#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \ - void generate(std::shared_ptr exec, \ - const matrix::Csr<_vtype, _itype>* matrix, \ - solver::SolveStruct* solve_struct, \ +#define GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL(_vtype, _itype) \ + void generate(std::shared_ptr exec, \ + const matrix::Csr<_vtype, _itype>* matrix, \ + std::shared_ptr& solve_struct, \ const gko::size_type num_rhs) @@ -80,7 +75,6 @@ namespace lower_trs { #define GKO_DECLARE_ALL_AS_TEMPLATES \ GKO_DECLARE_LOWER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL; \ - GKO_DECLARE_LOWER_TRS_INIT_STRUCT_KERNEL; \ template \ GKO_DECLARE_LOWER_TRS_SOLVE_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/solver/upper_trs.cpp b/core/solver/upper_trs.cpp index d109d2cbd65..7a26596b1c9 100644 --- a/core/solver/upper_trs.cpp +++ b/core/solver/upper_trs.cpp @@ -55,7 +55,6 @@ namespace { GKO_REGISTER_OPERATION(generate, upper_trs::generate); -GKO_REGISTER_OPERATION(init_struct, upper_trs::init_struct); GKO_REGISTER_OPERATION(should_perform_transpose, upper_trs::should_perform_transpose); GKO_REGISTER_OPERATION(solve, upper_trs::solve); @@ -85,19 +84,11 @@ std::unique_ptr UpperTrs::conj_transpose() const } -template -void UpperTrs::init_trs_solve_struct() -{ - this->get_executor()->run(upper_trs::make_init_struct(this->solve_struct_)); -} - - template void UpperTrs::generate() { this->get_executor()->run(upper_trs::make_generate( - gko::lend(system_matrix_), gko::lend(this->solve_struct_), - parameters_.num_rhs)); + gko::lend(system_matrix_), this->solve_struct_, parameters_.num_rhs)); } diff --git a/core/solver/upper_trs_kernels.hpp b/core/solver/upper_trs_kernels.hpp index ba35eb6373b..8dfa65e698c 100644 --- a/core/solver/upper_trs_kernels.hpp +++ b/core/solver/upper_trs_kernels.hpp @@ -58,15 +58,10 @@ namespace upper_trs { bool& do_transpose) -#define GKO_DECLARE_UPPER_TRS_INIT_STRUCT_KERNEL \ - void init_struct(std::shared_ptr exec, \ - std::shared_ptr& solve_struct) - - -#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \ - void generate(std::shared_ptr exec, \ - const matrix::Csr<_vtype, _itype>* matrix, \ - solver::SolveStruct* solve_struct, \ +#define GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL(_vtype, _itype) \ + void generate(std::shared_ptr exec, \ + const matrix::Csr<_vtype, _itype>* matrix, \ + std::shared_ptr& solve_struct, \ const gko::size_type num_rhs) @@ -80,7 +75,6 @@ namespace upper_trs { #define GKO_DECLARE_ALL_AS_TEMPLATES \ GKO_DECLARE_UPPER_TRS_SHOULD_PERFORM_TRANSPOSE_KERNEL; \ - GKO_DECLARE_UPPER_TRS_INIT_STRUCT_KERNEL; \ template \ GKO_DECLARE_UPPER_TRS_SOLVE_KERNEL(ValueType, IndexType); \ template \ diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 5bfb8629240..5551bede3f7 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -745,10 +745,23 @@ inline cusparseMatDescr_t create_mat_descr() { cusparseMatDescr_t descr{}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&descr)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + 
cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); return descr; } +inline void set_mat_fill_mode(cusparseMatDescr_t descr, + cusparseFillMode_t fill_mode) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode(descr, fill_mode)); +} + + inline void destroy(cusparseMatDescr_t descr) { GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyMatDescr(descr)); @@ -806,6 +819,24 @@ inline void destroy(cusparseDnVecDescr_t descr) } +template +inline cusparseDnMatDescr_t create_dnmat(int64_t rows, int64_t cols, + int64_t stride, ValueType* values) +{ + cusparseDnMatDescr_t descr{}; + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateDnMat( + &descr, rows, cols, stride, values, value_type, CUSPARSE_ORDER_ROW)); + return descr; +} + + +inline void destroy(cusparseDnMatDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnMat(descr)); +} + + template inline cusparseSpVecDescr_t create_spvec(int64_t size, int64_t nnz, IndexType* indices, ValueType* values) @@ -847,7 +878,37 @@ inline void destroy(cusparseSpMatDescr_t descr) GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(descr)); } -#endif + +#if (CUDA_VERSION >= 11030) + + +template +inline void set_attribute(cusparseSpMatDescr_t desc, + cusparseSpMatAttribute_t attr, AttribType val) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSpMatSetAttribute(desc, attr, &val, sizeof(val))); +} + + +inline cusparseSpSMDescr_t create_spsm_descr() +{ + cusparseSpSMDescr_t desc{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpSM_createDescr(&desc)); + return desc; +} + + +inline void destroy(cusparseSpSMDescr_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpSM_destroyDescr(info)); +} + + +#endif // CUDA_VERSION >= 11030 + + +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000) inline csrsm2Info_t create_solve_info() @@ -896,7 +957,7 @@ inline void destroy(csric02Info_t info) inline void buffer_size_ext( \ cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ + ValueType one, const cusparseMatDescr_t descr, \ const ValueType* csrVal, const int32* csrRowPtr, \ const int32* csrColInd, const ValueType* rhs, int32 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ @@ -904,7 +965,7 @@ inline void destroy(csric02Info_t info) { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ - as_culibs_type(one), descr, as_culibs_type(csrVal), \ + as_culibs_type(&one), descr, as_culibs_type(csrVal), \ csrRowPtr, csrColInd, as_culibs_type(rhs), sol_size, \ factor_info, policy, factor_work_size)); \ } \ @@ -916,7 +977,7 @@ inline void destroy(csric02Info_t info) inline void buffer_size_ext( \ cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ + ValueType one, const cusparseMatDescr_t descr, \ const ValueType* csrVal, const int64* csrRowPtr, \ const int64* csrColInd, const ValueType* rhs, int64 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ @@ -949,7 +1010,7 @@ GKO_BIND_CUSPARSE64_BUFFERSIZEEXT(ValueType, detail::not_implemented); inline void csrsm2_analysis( \ cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ + ValueType 
one, const cusparseMatDescr_t descr, \ const ValueType* csrVal, const int32* csrRowPtr, \ const int32* csrColInd, const ValueType* rhs, int32 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ @@ -957,7 +1018,7 @@ GKO_BIND_CUSPARSE64_BUFFERSIZEEXT(ValueType, detail::not_implemented); { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ - as_culibs_type(one), descr, as_culibs_type(csrVal), \ + as_culibs_type(&one), descr, as_culibs_type(csrVal), \ csrRowPtr, csrColInd, as_culibs_type(rhs), sol_size, \ factor_info, policy, factor_work_vec)); \ } \ @@ -969,7 +1030,7 @@ GKO_BIND_CUSPARSE64_BUFFERSIZEEXT(ValueType, detail::not_implemented); inline void csrsm2_analysis( \ cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ + ValueType one, const cusparseMatDescr_t descr, \ const ValueType* csrVal, const int64* csrRowPtr, \ const int64* csrColInd, const ValueType* rhs, int64 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ @@ -998,31 +1059,31 @@ GKO_BIND_CUSPARSE64_CSRSM2_ANALYSIS(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_CSRSM2_ANALYSIS -#define GKO_BIND_CUSPARSE32_CSRSM2_SOLVE(ValueType, CusparseName) \ - inline void csrsm2_solve( \ - cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ - cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ - const ValueType* csrVal, const int32* csrRowPtr, \ - const int32* csrColInd, ValueType* rhs, int32 sol_stride, \ - csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ - void* factor_work_vec) \ - { \ - GKO_ASSERT_NO_CUSPARSE_ERRORS( \ - CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ - as_culibs_type(one), descr, as_culibs_type(csrVal), \ - csrRowPtr, csrColInd, as_culibs_type(rhs), \ - sol_stride, factor_info, policy, factor_work_vec)); \ - } \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ +#define GKO_BIND_CUSPARSE32_CSRSM2_SOLVE(ValueType, CusparseName) \ + inline void csrsm2_solve( \ + cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ + cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ + ValueType one, const cusparseMatDescr_t descr, \ + const ValueType* csrVal, const int32* csrRowPtr, \ + const int32* csrColInd, ValueType* rhs, int32 sol_stride, \ + csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ + void* factor_work_vec) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS( \ + CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ + as_culibs_type(&one), descr, as_culibs_type(csrVal), \ + csrRowPtr, csrColInd, as_culibs_type(rhs), \ + sol_stride, factor_info, policy, factor_work_vec)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ "semi-colon warnings") #define GKO_BIND_CUSPARSE64_CSRSM2_SOLVE(ValueType, CusparseName) \ inline void csrsm2_solve( \ cusparseHandle_t handle, int algo, cusparseOperation_t trans1, \ cusparseOperation_t trans2, size_type m, size_type n, size_type nnz, \ - const ValueType* one, const cusparseMatDescr_t descr, \ + ValueType one, const cusparseMatDescr_t descr, \ const ValueType* csrVal, const int64* csrRowPtr, \ const int64* csrColInd, ValueType* rhs, int64 sol_stride, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ @@ -1047,6 +1108,54 @@ 
GKO_BIND_CUSPARSE64_CSRSM2_SOLVE(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_CSRSM2_SOLVE +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11030)) + + +template +size_type spsm_buffer_size(cusparseHandle_t handle, cusparseOperation_t op_a, + cusparseOperation_t op_b, ValueType alpha, + cusparseSpMatDescr_t descr_a, + cusparseDnMatDescr_t descr_b, + cusparseDnMatDescr_t descr_c, cusparseSpSMAlg_t algo, + cusparseSpSMDescr_t spsm_descr) +{ + size_type work_size; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpSM_bufferSize( + handle, op_a, op_b, &alpha, descr_a, descr_b, descr_c, + cuda_data_type(), algo, spsm_descr, &work_size)); + return work_size; +} + + +template +void spsm_analysis(cusparseHandle_t handle, cusparseOperation_t op_a, + cusparseOperation_t op_b, ValueType alpha, + cusparseSpMatDescr_t descr_a, cusparseDnMatDescr_t descr_b, + cusparseDnMatDescr_t descr_c, cusparseSpSMAlg_t algo, + cusparseSpSMDescr_t spsm_descr, void* work) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpSM_analysis( + handle, op_a, op_b, &alpha, descr_a, descr_b, descr_c, + cuda_data_type(), algo, spsm_descr, work)); +} + + +template +void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a, + cusparseOperation_t op_b, ValueType alpha, + cusparseSpMatDescr_t descr_a, cusparseDnMatDescr_t descr_b, + cusparseDnMatDescr_t descr_c, cusparseSpSMAlg_t algo, + cusparseSpSMDescr_t spsm_descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpSM_solve( + handle, op_a, op_b, &alpha, descr_a, descr_b, descr_c, + cuda_data_type(), algo, spsm_descr)); +} + + +#endif + + template void create_identity_permutation(cusparseHandle_t handle, IndexType size, IndexType* permutation) GKO_NOT_IMPLEMENTED; diff --git a/cuda/base/pointer_mode_guard.hpp b/cuda/base/pointer_mode_guard.hpp index 34923d21abf..f9ed73f4ba6 100644 --- a/cuda/base/pointer_mode_guard.hpp +++ b/cuda/base/pointer_mode_guard.hpp @@ -108,9 +108,9 @@ namespace cusparse { */ class pointer_mode_guard { public: - pointer_mode_guard(cusparseHandle_t& handle) + pointer_mode_guard(cusparseHandle_t handle) { - l_handle = &handle; + l_handle = handle; GKO_ASSERT_NO_CUSPARSE_ERRORS( cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST)); } @@ -127,15 +127,15 @@ class pointer_mode_guard { { /* Ignore the error during stack unwinding for this call */ if (std::uncaught_exception()) { - cusparseSetPointerMode(*l_handle, CUSPARSE_POINTER_MODE_DEVICE); + cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE); } else { - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetPointerMode( - *l_handle, CUSPARSE_POINTER_MODE_DEVICE)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetPointerMode(l_handle, CUSPARSE_POINTER_MODE_DEVICE)); } } private: - cusparseHandle_t* l_handle; + cusparseHandle_t l_handle; }; diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 2c01e6a76f5..20fc525a1a6 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -35,6 +35,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
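The CudaSolveStruct defined below drives cuSPARSE's generic triangular-solve
API, which always runs in three phases: query the workspace size, run the
(expensive) analysis once per matrix, then solve as often as needed. A
condensed sketch of that call sequence using the wrappers added above
(illustrative only; m, nrhs, stride, nnz and the device pointers are
placeholders, error handling omitted):

    auto handle = exec->get_cusparse_handle();
    auto spsm_descr = cusparse::create_spsm_descr();
    auto descr_a = cusparse::create_csr(m, m, nnz, row_ptrs, col_idxs, vals);
    cusparse::set_attribute<cusparseFillMode_t>(
        descr_a, CUSPARSE_SPMAT_FILL_MODE, CUSPARSE_FILL_MODE_LOWER);
    auto descr_b = cusparse::create_dnmat(m, nrhs, stride, b_vals);
    auto descr_c = cusparse::create_dnmat(m, nrhs, stride, x_vals);
    // phase 1: how much scratch memory does the solver need?
    Array<char> work{exec};
    work.resize_and_reset(cusparse::spsm_buffer_size(
        handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
        CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(), descr_a, descr_b,
        descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr));
    // phase 2: analyze the sparsity pattern once
    cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(),
                            descr_a, descr_b, descr_c,
                            CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr,
                            work.get_data());
    // phase 3: solve; can be repeated for new right-hand sides
    cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                         CUSPARSE_OPERATION_NON_TRANSPOSE, one<ValueType>(),
                         descr_a, descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT,
                         spsm_descr);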
#include +#include #include @@ -59,64 +60,215 @@ namespace solver { struct SolveStruct { - virtual ~SolveStruct() {} + virtual ~SolveStruct() = default; }; +} // namespace solver + + +namespace kernels { namespace cuda { +namespace { + + +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11030)) + + +template +struct CudaSolveStruct : gko::solver::SolveStruct { + cusparseHandle_t handle; + cusparseSpSMDescr_t spsm_descr; + cusparseSpMatDescr_t descr_a; + Array work; + + CudaSolveStruct(std::shared_ptr exec, + const matrix::Csr* matrix, + size_type num_rhs, bool is_upper) + : handle{exec->get_cusparse_handle()}, + spsm_descr{}, + descr_a{}, + work{exec} + { + cusparse::pointer_mode_guard pm_guard(handle); + spsm_descr = cusparse::create_spsm_descr(); + descr_a = cusparse::create_csr( + matrix->get_size()[0], matrix->get_size()[1], + matrix->get_num_stored_elements(), + const_cast(matrix->get_const_row_ptrs()), + const_cast(matrix->get_const_col_idxs()), + const_cast(matrix->get_const_values())); + cusparse::set_attribute( + descr_a, CUSPARSE_SPMAT_FILL_MODE, + is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); + cusparse::set_attribute( + descr_a, CUSPARSE_SPMAT_DIAG_TYPE, CUSPARSE_DIAG_TYPE_NON_UNIT); + + const auto rows = matrix->get_size()[0]; + // workaround suggested by NVIDIA engineers: for some reason + // cusparse needs non-nullptr input vectors even for analysis + auto descr_b = cusparse::create_dnmat( + matrix->get_size()[0], num_rhs, matrix->get_size()[1], + reinterpret_cast(0xFF)); + auto descr_c = cusparse::create_dnmat( + matrix->get_size()[0], num_rhs, matrix->get_size()[1], + reinterpret_cast(0xFF)); + + auto work_size = cusparse::spsm_buffer_size( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, one(), descr_a, + descr_b, descr_c, CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); + + work.resize_and_reset(work_size); + + cusparse::spsm_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + one(), descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr, + work.get_data()); + + cusparse::destroy(descr_b); + cusparse::destroy(descr_c); + } + + void solve(const matrix::Csr*, + const matrix::Dense* input, + matrix::Dense* output, matrix::Dense*, + matrix::Dense*) const + { + cusparse::pointer_mode_guard pm_guard(handle); + auto descr_b = cusparse::create_dnmat( + input->get_size()[0], input->get_size()[1], input->get_stride(), + const_cast(input->get_const_values())); + auto descr_c = + cusparse::create_dnmat(output->get_size()[0], output->get_size()[1], + output->get_stride(), output->get_values()); + + cusparse::spsm_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, one(), + descr_a, descr_b, descr_c, + CUSPARSE_SPSM_ALG_DEFAULT, spsm_descr); + + cusparse::destroy(descr_b); + cusparse::destroy(descr_c); + } + + ~CudaSolveStruct() + { + if (descr_a) { + cusparse::destroy(descr_a); + descr_a = nullptr; + } + if (spsm_descr) { + cusparse::destroy(spsm_descr); + spsm_descr = nullptr; + } + } + + CudaSolveStruct(const SolveStruct&) = delete; + + CudaSolveStruct(SolveStruct&&) = delete; + + CudaSolveStruct& operator=(const SolveStruct&) = delete; + + CudaSolveStruct& operator=(SolveStruct&&) = delete; +}; + +#elif (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) -struct SolveStruct : gko::solver::SolveStruct { +template +struct CudaSolveStruct : gko::solver::SolveStruct { + std::shared_ptr exec; + cusparseHandle_t handle; int algorithm; csrsm2Info_t solve_info; 
cusparseSolvePolicy_t policy; cusparseMatDescr_t factor_descr; - size_t factor_work_size; - void* factor_work_vec; - SolveStruct() + mutable Array work; + + CudaSolveStruct(std::shared_ptr exec, + const matrix::Csr* matrix, + size_type num_rhs, bool is_upper) + : exec{exec}, + handle{exec->get_cusparse_handle()}, + algorithm{}, + solve_info{}, + policy{}, + factor_descr{}, + work{exec} { - factor_work_vec = nullptr; - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateMatDescr(&factor_descr)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatIndexBase(factor_descr, CUSPARSE_INDEX_BASE_ZERO)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatType(factor_descr, CUSPARSE_MATRIX_TYPE_GENERAL)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetMatDiagType(factor_descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsrsm2Info(&solve_info)); + cusparse::pointer_mode_guard pm_guard(handle); + factor_descr = cusparse::create_mat_descr(); + solve_info = cusparse::create_solve_info(); + cusparse::set_mat_fill_mode( + factor_descr, + is_upper ? CUSPARSE_FILL_MODE_UPPER : CUSPARSE_FILL_MODE_LOWER); algorithm = 0; policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; - } - - SolveStruct(const SolveStruct&) = delete; - - SolveStruct(SolveStruct&&) = delete; - SolveStruct& operator=(const SolveStruct&) = delete; + size_type work_size{}; + + cusparse::buffer_size_ext( + handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + matrix->get_num_stored_elements(), one(), factor_descr, + matrix->get_const_values(), matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + &work_size); + + // allocate workspace + work.resize_and_reset(work_size); + + cusparse::csrsm2_analysis( + handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], num_rhs, + matrix->get_num_stored_elements(), one(), factor_descr, + matrix->get_const_values(), matrix->get_const_row_ptrs(), + matrix->get_const_col_idxs(), nullptr, num_rhs, solve_info, policy, + work.get_data()); + } - SolveStruct& operator=(SolveStruct&&) = delete; + void solve(const matrix::Csr* matrix, + const matrix::Dense* input, + matrix::Dense* output, matrix::Dense*, + matrix::Dense*) const + { + cusparse::pointer_mode_guard pm_guard(handle); + dense::copy(exec, input, output); + cusparse::csrsm2_solve( + handle, algorithm, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], + output->get_stride(), matrix->get_num_stored_elements(), + one(), factor_descr, matrix->get_const_values(), + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + output->get_values(), output->get_stride(), solve_info, policy, + work.get_data()); + } - ~SolveStruct() + ~CudaSolveStruct() { - cusparseDestroyMatDescr(factor_descr); - if (solve_info) { - cusparseDestroyCsrsm2Info(solve_info); + if (factor_descr) { + cusparse::destroy(factor_descr); + factor_descr = nullptr; } - if (factor_work_vec != nullptr) { - cudaFree(factor_work_vec); - factor_work_vec = nullptr; + if (solve_info) { + cusparse::destroy(solve_info); + solve_info = nullptr; } } -}; + CudaSolveStruct(const CudaSolveStruct&) = delete; -} // namespace cuda -} // namespace solver + CudaSolveStruct(CudaSolveStruct&&) = delete; + CudaSolveStruct& operator=(const CudaSolveStruct&) = delete; + + CudaSolveStruct& operator=(CudaSolveStruct&&) = delete; +}; -namespace kernels { -namespace cuda { -namespace { + +#endif void 
should_perform_transpose_kernel(std::shared_ptr exec, @@ -126,64 +278,15 @@ void should_perform_transpose_kernel(std::shared_ptr exec, } -void init_struct_kernel(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - solve_struct = std::make_shared(); -} - - template void generate_kernel(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, + std::shared_ptr& solve_struct, const gko::size_type num_rhs, bool is_upper) { if (cusparse::is_supported::value) { - if (auto cuda_solve_struct = - dynamic_cast(solve_struct)) { - auto handle = exec->get_cusparse_handle(); - if (is_upper) { - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSetMatFillMode( - cuda_solve_struct->factor_descr, CUSPARSE_FILL_MODE_UPPER)); - } - - ValueType one = 1.0; - - { - cusparse::pointer_mode_guard pm_guard(handle); - cusparse::buffer_size_ext( - handle, cuda_solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], - num_rhs, matrix->get_num_stored_elements(), &one, - cuda_solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - nullptr, num_rhs, cuda_solve_struct->solve_info, - cuda_solve_struct->policy, - &cuda_solve_struct->factor_work_size); - - // allocate workspace - if (cuda_solve_struct->factor_work_vec != nullptr) { - exec->free(cuda_solve_struct->factor_work_vec); - } - cuda_solve_struct->factor_work_vec = - exec->alloc(cuda_solve_struct->factor_work_size); - - cusparse::csrsm2_analysis( - handle, cuda_solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], - num_rhs, matrix->get_num_stored_elements(), &one, - cuda_solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - nullptr, num_rhs, cuda_solve_struct->solve_info, - cuda_solve_struct->policy, - cuda_solve_struct->factor_work_vec); - } - } else { - GKO_NOT_SUPPORTED(solve_struct); - } + solve_struct = std::make_shared>( + exec, matrix, num_rhs, is_upper); } else { GKO_NOT_IMPLEMENTED; } @@ -203,23 +306,9 @@ void solve_kernel(std::shared_ptr exec, if (cusparse::is_supported::value) { if (auto cuda_solve_struct = - dynamic_cast(solve_struct)) { - ValueType one = 1.0; - auto handle = exec->get_cusparse_handle(); - x->copy_from(gko::lend(b)); - { - cusparse::pointer_mode_guard pm_guard(handle); - cusparse::csrsm2_solve( - handle, cuda_solve_struct->algorithm, - CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, matrix->get_size()[0], - b->get_stride(), matrix->get_num_stored_elements(), &one, - cuda_solve_struct->factor_descr, matrix->get_const_values(), - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - x->get_values(), b->get_stride(), - cuda_solve_struct->solve_info, cuda_solve_struct->policy, - cuda_solve_struct->factor_work_vec); - } + dynamic_cast*>( + solve_struct)) { + cuda_solve_struct->solve(matrix, b, x, trans_b, trans_x); } else { GKO_NOT_SUPPORTED(solve_struct); } diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index 07c8cce583f..e07c0d19679 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -69,36 +69,36 @@ constexpr int default_block_size = 512; template __device__ std::enable_if_t::value, ValueType> -load(const ValueType *values, int index) +load(const ValueType* values, int index) { - const volatile ValueType *val = values + index; + const volatile ValueType* val = values + 
index; return *val; } template __device__ std::enable_if_t::value, thrust::complex> -load(const thrust::complex *values, int index) +load(const thrust::complex* values, int index) { - auto real = reinterpret_cast(values); + auto real = reinterpret_cast(values); auto imag = real + 1; return {load(real, 2 * index), load(imag, 2 * index)}; } template __device__ void store( - ValueType *values, int index, + ValueType* values, int index, std::enable_if_t::value, ValueType> value) { - volatile ValueType *val = values + index; + volatile ValueType* val = values + index; *val = value; } template -__device__ void store(thrust::complex *values, int index, +__device__ void store(thrust::complex* values, int index, thrust::complex value) { - auto real = reinterpret_cast(values); + auto real = reinterpret_cast(values); auto imag = real + 1; store(real, 2 * index, value.real()); store(imag, 2 * index, value.imag()); @@ -107,9 +107,9 @@ __device__ void store(thrust::complex *values, int index, template __global__ void sptrsv_naive_caching_kernel( - const IndexType *const rowptrs, const IndexType *const colidxs, - const ValueType *const vals, const ValueType *const b, ValueType *const x, - const size_type n, bool *nan_produced) + const IndexType* const rowptrs, const IndexType* const colidxs, + const ValueType* const vals, const ValueType* const b, ValueType* const x, + const size_type n, bool* nan_produced) { const auto gid = thread::get_thread_id_flat(); const auto row = gid; @@ -122,7 +122,7 @@ __global__ void sptrsv_naive_caching_kernel( const auto self_shid = gid % default_block_size; __shared__ UninitializedArray x_s_array; - ValueType *x_s = x_s_array; + ValueType* x_s = x_s_array; x_s[self_shid] = nan(); __syncthreads(); @@ -166,9 +166,9 @@ __global__ void sptrsv_naive_caching_kernel( template __global__ void sptrsv_naive_legacy_kernel( - const IndexType *const rowptrs, const IndexType *const colidxs, - const ValueType *const vals, const ValueType *const b, ValueType *const x, - const size_type n, bool *nan_produced) + const IndexType* const rowptrs, const IndexType* const colidxs, + const ValueType* const vals, const ValueType* const b, ValueType* const x, + const size_type n, bool* nan_produced) { const auto gid = thread::get_thread_id_flat(); const auto row = gid; @@ -203,9 +203,9 @@ __global__ void sptrsv_naive_legacy_kernel( template void sptrsv_naive_caching(std::shared_ptr exec, - const matrix::Csr *matrix, - const matrix::Dense *b, - matrix::Dense *x) + const matrix::Csr* matrix, + const matrix::Dense* b, + matrix::Dense* x) { // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. 
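For orientation, every kernel dispatched below computes the same result as
this sequential CSR lower-triangular solve (a minimal sketch with illustrative
names; it assumes, like the kernels, that column indices are sorted within
each row and the diagonal entry is stored last):

    template <typename ValueType, typename IndexType>
    void reference_lower_trs(const IndexType* row_ptrs,
                             const IndexType* col_idxs, const ValueType* vals,
                             const ValueType* b, ValueType* x, IndexType n)
    {
        for (IndexType row = 0; row < n; ++row) {
            ValueType sum{};
            // all entries of the row except the trailing diagonal
            for (auto i = row_ptrs[row]; i < row_ptrs[row + 1] - 1; ++i) {
                sum += vals[i] * x[col_idxs[i]];  // earlier rows are solved
            }
            x[row] = (b[row] - sum) / vals[row_ptrs[row + 1] - 1];
        }
    }

The parallel kernels assign one thread per row and replace "earlier rows are
solved" by spinning until the dependency stops reading as NaN.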
const auto is_fallback_required = exec->get_major_version() < 7; @@ -255,17 +255,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - init_struct_kernel(exec, solve_struct); -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { if (matrix->get_strategy()->get_name() == "sparselib") { generate_kernel(exec, matrix, solve_struct, diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index 527dc3958a7..1f6f9c283f2 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -69,17 +69,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - init_struct_kernel(exec, solve_struct); -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { generate_kernel(exec, matrix, solve_struct, num_rhs, true); diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp index 5f25d4c057b..ab372fc45fc 100644 --- a/dpcpp/solver/lower_trs_kernels.dp.cpp +++ b/dpcpp/solver/lower_trs_kernels.dp.cpp @@ -59,27 +59,11 @@ namespace dpcpp { namespace lower_trs { -void should_perform_transpose(std::shared_ptr exec, - bool& do_transpose) GKO_NOT_IMPLEMENTED; - - -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) -{ - // This generate kernel is here to allow for a more sophisticated - // implementation as for other executors. This kernel would perform the - // "analysis" phase for the triangular matrix. -} + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_LOWER_TRS_GENERATE_KERNEL); diff --git a/dpcpp/solver/upper_trs_kernels.dp.cpp b/dpcpp/solver/upper_trs_kernels.dp.cpp index e46e7cb5195..e014095bb52 100644 --- a/dpcpp/solver/upper_trs_kernels.dp.cpp +++ b/dpcpp/solver/upper_trs_kernels.dp.cpp @@ -66,23 +66,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) -{ - // This generate kernel is here to allow for a more sophisticated - // implementation as for other executors. This kernel would perform the - // "analysis" phase for the triangular matrix. 
-} + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) GKO_NOT_IMPLEMENTED; GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( GKO_DECLARE_UPPER_TRS_GENERATE_KERNEL); diff --git a/hip/solver/lower_trs_kernels.hip.cpp b/hip/solver/lower_trs_kernels.hip.cpp index 0f7751a039d..5a7cfde692b 100644 --- a/hip/solver/lower_trs_kernels.hip.cpp +++ b/hip/solver/lower_trs_kernels.hip.cpp @@ -71,18 +71,18 @@ void should_perform_transpose(std::shared_ptr exec, void init_struct(std::shared_ptr exec, std::shared_ptr& solve_struct) -{ - init_struct_kernel(exec, solve_struct); -} +{} template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { - generate_kernel(exec, matrix, solve_struct, num_rhs, - false); + init_struct_kernel(exec, solve_struct); + generate_kernel(exec, matrix, solve_struct.get(), + num_rhs, false); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/hip/solver/upper_trs_kernels.hip.cpp b/hip/solver/upper_trs_kernels.hip.cpp index 0dd3d788d67..07bd0254531 100644 --- a/hip/solver/upper_trs_kernels.hip.cpp +++ b/hip/solver/upper_trs_kernels.hip.cpp @@ -69,20 +69,15 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - init_struct_kernel(exec, solve_struct); -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { - generate_kernel(exec, matrix, solve_struct, num_rhs, - true); + init_struct_kernel(exec, solve_struct); + generate_kernel(exec, matrix, solve_struct.get(), + num_rhs, true); } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/include/ginkgo/core/solver/lower_trs.hpp b/include/ginkgo/core/solver/lower_trs.hpp index ab375c05406..94244ed1955 100644 --- a/include/ginkgo/core/solver/lower_trs.hpp +++ b/include/ginkgo/core/solver/lower_trs.hpp @@ -123,8 +123,6 @@ class LowerTrs : public EnableLinOp>, GKO_ENABLE_BUILD_METHOD(Factory); protected: - void init_trs_solve_struct(); - void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, @@ -159,7 +157,6 @@ class LowerTrs : public EnableLinOp>, system_matrix_ = copy_and_convert_to(exec, system_matrix); } - this->init_trs_solve_struct(); this->generate(); } diff --git a/include/ginkgo/core/solver/upper_trs.hpp b/include/ginkgo/core/solver/upper_trs.hpp index 66f388d4823..5a49c094dea 100644 --- a/include/ginkgo/core/solver/upper_trs.hpp +++ b/include/ginkgo/core/solver/upper_trs.hpp @@ -123,8 +123,6 @@ class UpperTrs : public EnableLinOp>, GKO_ENABLE_BUILD_METHOD(Factory); protected: - void init_trs_solve_struct(); - void apply_impl(const LinOp* b, LinOp* x) const override; void apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, @@ -159,7 +157,6 @@ class UpperTrs : public EnableLinOp>, system_matrix_ = copy_and_convert_to(exec, system_matrix); } - this->init_trs_solve_struct(); this->generate(); } diff --git a/omp/solver/lower_trs_kernels.cpp b/omp/solver/lower_trs_kernels.cpp index edc81943ee5..d8b296aeec6 100644 --- a/omp/solver/lower_trs_kernels.cpp +++ b/omp/solver/lower_trs_kernels.cpp @@ -66,18 +66,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init 
kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/omp/solver/upper_trs_kernels.cpp b/omp/solver/upper_trs_kernels.cpp index 0952c339990..75b50cc65cb 100644 --- a/omp/solver/upper_trs_kernels.cpp +++ b/omp/solver/upper_trs_kernels.cpp @@ -66,18 +66,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/omp/test/solver/lower_trs_kernels.cpp b/omp/test/solver/lower_trs_kernels.cpp index 1a15fbe5647..eca1e1ea0bf 100644 --- a/omp/test/solver/lower_trs_kernels.cpp +++ b/omp/test/solver/lower_trs_kernels.cpp @@ -139,20 +139,13 @@ TEST_F(LowerTrs, OmpLowerTrsFlagCheckIsCorrect) } -TEST_F(LowerTrs, OmpLowerTrsSolveStructInitIsEquivalentToRef) -{ - gko::kernels::reference::lower_trs::init_struct(ref, solve_struct_ref); - gko::kernels::omp::lower_trs::init_struct(omp, solve_struct_omp); -} - - TEST_F(LowerTrs, OmpLowerTrsGenerateIsEquivalentToRef) { gko::size_type num_rhs = 1; - gko::kernels::reference::lower_trs::generate( - ref, csr_mat.get(), solve_struct_ref.get(), num_rhs); + gko::kernels::reference::lower_trs::generate(ref, csr_mat.get(), + solve_struct_ref, num_rhs); gko::kernels::omp::lower_trs::generate(omp, d_csr_mat.get(), - solve_struct_omp.get(), num_rhs); + solve_struct_omp, num_rhs); } @@ -160,8 +153,6 @@ TEST_F(LowerTrs, OmpLowerTrsSolveIsEquivalentToRef) { initialize_data(59, 43); - gko::kernels::reference::lower_trs::init_struct(ref, solve_struct_ref); - gko::kernels::omp::lower_trs::init_struct(omp, solve_struct_omp); gko::kernels::reference::lower_trs::solve(ref, csr_mat.get(), solve_struct_ref.get(), t_b.get(), t_x.get(), b.get(), x.get()); diff --git a/omp/test/solver/upper_trs_kernels.cpp b/omp/test/solver/upper_trs_kernels.cpp index d3381167f43..53d5822f617 100644 --- a/omp/test/solver/upper_trs_kernels.cpp +++ b/omp/test/solver/upper_trs_kernels.cpp @@ -138,20 +138,13 @@ TEST_F(UpperTrs, OmpUpperTrsFlagCheckIsCorrect) } -TEST_F(UpperTrs, OmpUpperTrsSolveStructInitIsEquivalentToRef) -{ - gko::kernels::reference::upper_trs::init_struct(ref, solve_struct_ref); - gko::kernels::omp::upper_trs::init_struct(omp, solve_struct_omp); -} - - TEST_F(UpperTrs, OmpUpperTrsGenerateIsEquivalentToRef) { gko::size_type num_rhs = 1; - gko::kernels::reference::upper_trs::generate( - ref, csr_mat.get(), solve_struct_ref.get(), num_rhs); + gko::kernels::reference::upper_trs::generate(ref, csr_mat.get(), + solve_struct_ref, num_rhs); gko::kernels::omp::upper_trs::generate(omp, d_csr_mat.get(), - solve_struct_omp.get(), num_rhs); + solve_struct_omp, num_rhs); } @@ -159,8 +152,6 @@ 
TEST_F(UpperTrs, OmpUpperTrsSolveIsEquivalentToRef) { initialize_data(59, 43); - gko::kernels::reference::upper_trs::init_struct(ref, solve_struct_ref); - gko::kernels::omp::upper_trs::init_struct(omp, solve_struct_omp); gko::kernels::reference::upper_trs::solve(ref, csr_mat.get(), solve_struct_ref.get(), t_b.get(), t_x.get(), b.get(), x.get()); diff --git a/reference/solver/lower_trs_kernels.cpp b/reference/solver/lower_trs_kernels.cpp index aca478c77da..a521d622d87 100644 --- a/reference/solver/lower_trs_kernels.cpp +++ b/reference/solver/lower_trs_kernels.cpp @@ -62,18 +62,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the diff --git a/reference/solver/upper_trs_kernels.cpp b/reference/solver/upper_trs_kernels.cpp index 46dbbd423a7..d08924798b0 100644 --- a/reference/solver/upper_trs_kernels.cpp +++ b/reference/solver/upper_trs_kernels.cpp @@ -62,18 +62,11 @@ void should_perform_transpose(std::shared_ptr exec, } -void init_struct(std::shared_ptr exec, - std::shared_ptr& solve_struct) -{ - // This init kernel is here to allow initialization of the solve struct for - // a more sophisticated implementation as for other executors. -} - - template void generate(std::shared_ptr exec, const matrix::Csr* matrix, - solver::SolveStruct* solve_struct, const gko::size_type num_rhs) + std::shared_ptr& solve_struct, + const gko::size_type num_rhs) { // This generate kernel is here to allow for a more sophisticated // implementation as for other executors. This kernel would perform the From 5f603397f42127b80c09957653d86239509722c8 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 27 Aug 2021 09:10:19 +0200 Subject: [PATCH 07/15] use custom implementation for upper_trs --- cuda/solver/common_trs_kernels.cuh | 197 +++++++++++++++++++++++++ cuda/solver/lower_trs_kernels.cu | 188 +---------------------- cuda/solver/upper_trs_kernels.cu | 14 +- cuda/test/solver/lower_trs_kernels.cpp | 16 ++ cuda/test/solver/upper_trs_kernels.cpp | 16 ++ 5 files changed, 240 insertions(+), 191 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 20fc525a1a6..7c1d68255b7 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -53,6 +53,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
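The kernels this patch moves into the shared header are templated on is_upper, so a single implementation serves both triangles: a lower-triangular row is walked front to back with the diagonal as its last stored entry, while an upper-triangular row is walked backwards from its last entry with the diagonal first. A minimal sketch of that index mapping as a standalone helper (hypothetical, not part of the patch; it mirrors the row_begin/row_diag/row_step computation in the kernels below):

    // Compute begin/diagonal indices and step direction for one CSR row;
    // the loop `for (auto i = begin; i != diag; i += step)` then visits
    // exactly the off-diagonal entries of either triangle, in dependency
    // order.
    template <bool is_upper, typename IndexType>
    void row_traversal(const IndexType* rowptrs, IndexType row,
                       IndexType& begin, IndexType& diag, int& step)
    {
        begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row];
        diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1;
        step = is_upper ? -1 : 1;
    }
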
#include "cuda/base/math.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/thread_ids.cuh" +#include "cuda/components/uninitialized_array.hpp" namespace gko { @@ -318,6 +320,201 @@ void solve_kernel(std::shared_ptr exec, } +constexpr int default_block_size = 512; + + +template +__device__ std::enable_if_t::value, ValueType> +load(const ValueType *values, int index) +{ + const volatile ValueType *val = values + index; + return *val; +} + +template +__device__ std::enable_if_t::value, + thrust::complex> +load(const thrust::complex *values, int index) +{ + auto real = reinterpret_cast(values); + auto imag = real + 1; + return {load(real, 2 * index), load(imag, 2 * index)}; +} + +template +__device__ void store( + ValueType *values, int index, + std::enable_if_t::value, ValueType> value) +{ + volatile ValueType *val = values + index; + *val = value; +} + +template +__device__ void store(thrust::complex *values, int index, + thrust::complex value) +{ + auto real = reinterpret_cast(values); + auto imag = real + 1; + store(real, 2 * index, value.real()); + store(imag, 2 * index, value.imag()); +} + + +template +__global__ void sptrsv_naive_caching_kernel( + const IndexType *const rowptrs, const IndexType *const colidxs, + const ValueType *const vals, const ValueType *const b, ValueType *const x, + const size_type n, bool *nan_produced) +{ + const auto gid = thread::get_thread_id_flat(); + const auto row = is_upper ? n - 1 - gid : gid; + + if (gid >= n) { + return; + } + + const auto self_shmem_id = gid / default_block_size; + const auto self_shid = gid % default_block_size; + + __shared__ UninitializedArray x_s_array; + ValueType *x_s = x_s_array; + x_s[self_shid] = nan(); + + __syncthreads(); + + // lower tri matrix: start at beginning, run forward until last entry, + // (row_end - 1) which is the diagonal entry + // upper tri matrix: start at last entry (row_end - 1), run backward + // until first entry, which is the diagonal entry + const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row]; + const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1; + const int row_step = is_upper ? -1 : 1; + + ValueType sum = 0.0; + for (auto i = row_begin; i != row_diag; i += row_step) { + const auto dependency = colidxs[i]; + auto x_p = &x[dependency]; + + const auto dependency_gid = is_upper ? n - 1 - dependency : dependency; + const bool shmem_possible = + dependency_gid / default_block_size == self_shmem_id; + if (shmem_possible) { + const auto dependency_shid = dependency_gid % default_block_size; + x_p = &x_s[dependency_shid]; + } + + ValueType x = *x_p; + while (is_nan(x)) { + x = load(x_p, 0); + } + + sum += x * vals[i]; + } + + const auto r = (b[row] - sum) / vals[row_diag]; + + store(x_s, self_shid, r); + x[row] = r; + + // This check to ensure no infinte loops happen. + if (is_nan(r)) { + store(x_s, self_shid, zero()); + x[row] = zero(); + *nan_produced = true; + } +} + + +template +__global__ void sptrsv_naive_legacy_kernel( + const IndexType *const rowptrs, const IndexType *const colidxs, + const ValueType *const vals, const ValueType *const b, ValueType *const x, + const size_type n, bool *nan_produced) +{ + const auto gid = thread::get_thread_id_flat(); + const auto row = is_upper ? 
n - 1 - gid : gid; + + if (gid >= n) { + return; + } + + // lower tri matrix: start at beginning, run forward until last entry, + // (row_end - 1) which is the diagonal entry + // upper tri matrix: start at last entry (row_end - 1), run backward + // until first entry, which is the diagonal entry + const auto row_begin = is_upper ? rowptrs[row + 1] - 1 : rowptrs[row]; + const auto row_diag = is_upper ? rowptrs[row] : rowptrs[row + 1] - 1; + const int row_step = is_upper ? -1 : 1; + + ValueType sum = 0.0; + auto j = row_begin; + while (j != row_diag + row_step) { + auto col = colidxs[j]; + while (!is_nan(load(x, col))) { + sum += vals[j] * load(x, col); + j += row_step; + col = colidxs[j]; + } + if (row == col) { + const auto r = (b[row] - sum) / vals[row_diag]; + store(x, row, r); + j += row_step; + if (is_nan(r)) { + store(x, row, zero()); + *nan_produced = true; + } + } + } +} + + +template +void sptrsv_naive_caching(std::shared_ptr exec, + const matrix::Csr *matrix, + const matrix::Dense *b, + matrix::Dense *x) +{ + // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. + const auto is_fallback_required = exec->get_major_version() < 7; + + const IndexType n = matrix->get_size()[0]; + + // Initialize x to all NaNs. + cudaMemset(x->get_values(), 0xFF, n * sizeof(ValueType)); + + Array nan_produced(exec, 1); + cudaMemset(nan_produced.get_data(), false, sizeof(bool)); + + const dim3 block_size(is_fallback_required ? 32 : default_block_size, 1, 1); + const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); + + if (is_fallback_required) { + sptrsv_naive_legacy_kernel<<>>( + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + as_cuda_type(matrix->get_const_values()), + as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), + n, nan_produced.get_data()); + } else { + sptrsv_naive_caching_kernel<<>>( + matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), + as_cuda_type(matrix->get_const_values()), + as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), + n, nan_produced.get_data()); + } + + if (exec->copy_val_to_host(nan_produced.get_const_data())) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Error: triangular solve produced NaN, either not all diagonal " + "elements are nonzero, or the system is very ill-conditioned. " + "The NaN will be replaced with a zero." + << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + } +} + + } // namespace } // namespace cuda } // namespace kernels diff --git a/cuda/solver/lower_trs_kernels.cu b/cuda/solver/lower_trs_kernels.cu index e07c0d19679..5c16608a67d 100644 --- a/cuda/solver/lower_trs_kernels.cu +++ b/cuda/solver/lower_trs_kernels.cu @@ -48,8 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
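With the implementation hoisted into common_trs_kernels.cuh, this file reduces to the strategy dispatch visible at its end: the CSR strategy name selects between the cuSPARSE path and the custom sptrsv. From user code the selection looks roughly like this (a sketch assuming double/int32 value and index types):

    using Csr = gko::matrix::Csr<double, gko::int32>;
    auto mtx = gko::share(Csr::create(exec));  // exec is a CudaExecutor
    // (fill mtx with a triangular system before generating the solver)
    // vendor path: strategy name "sparselib" keeps the cuSPARSE solver
    mtx->set_strategy(std::make_shared<Csr::sparselib>());
    // custom path: any other strategy, e.g. classical, uses the new sptrsv
    mtx->set_strategy(std::make_shared<Csr::classical>());
    auto solver = gko::solver::LowerTrs<double, gko::int32>::build()
                      .on(exec)
                      ->generate(mtx);
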
#include "cuda/base/cusparse_bindings.hpp" #include "cuda/base/math.hpp" #include "cuda/base/types.hpp" -#include "cuda/components/thread_ids.cuh" -#include "cuda/components/uninitialized_array.hpp" #include "cuda/solver/common_trs_kernels.cuh" @@ -64,190 +62,6 @@ namespace cuda { namespace lower_trs { -constexpr int default_block_size = 512; - - -template -__device__ std::enable_if_t::value, ValueType> -load(const ValueType* values, int index) -{ - const volatile ValueType* val = values + index; - return *val; -} - -template -__device__ std::enable_if_t::value, - thrust::complex> -load(const thrust::complex* values, int index) -{ - auto real = reinterpret_cast(values); - auto imag = real + 1; - return {load(real, 2 * index), load(imag, 2 * index)}; -} - -template -__device__ void store( - ValueType* values, int index, - std::enable_if_t::value, ValueType> value) -{ - volatile ValueType* val = values + index; - *val = value; -} - -template -__device__ void store(thrust::complex* values, int index, - thrust::complex value) -{ - auto real = reinterpret_cast(values); - auto imag = real + 1; - store(real, 2 * index, value.real()); - store(imag, 2 * index, value.imag()); -} - - -template -__global__ void sptrsv_naive_caching_kernel( - const IndexType* const rowptrs, const IndexType* const colidxs, - const ValueType* const vals, const ValueType* const b, ValueType* const x, - const size_type n, bool* nan_produced) -{ - const auto gid = thread::get_thread_id_flat(); - const auto row = gid; - - if (row >= n) { - return; - } - - const auto self_shmem_id = gid / default_block_size; - const auto self_shid = gid % default_block_size; - - __shared__ UninitializedArray x_s_array; - ValueType* x_s = x_s_array; - x_s[self_shid] = nan(); - - __syncthreads(); - - const auto row_start = rowptrs[row]; - const auto row_end = rowptrs[row + 1] - 1; - - ValueType sum = 0.0; - for (IndexType i = row_start; i < row_end; ++i) { - const auto dependency = colidxs[i]; - auto x_p = &x[dependency]; - - const bool shmem_possible = - dependency / default_block_size == self_shmem_id; - if (shmem_possible) { - const auto dependency_shid = dependency % default_block_size; - x_p = &x_s[dependency_shid]; - } - - ValueType x = *x_p; - while (is_nan(x)) { - x = load(x_p, 0); - } - - sum += x * vals[i]; - } - - const auto r = (b[row] - sum) / vals[row_end]; - - store(x_s, self_shid, r); - x[row] = r; - - // This check to ensure no infinte loops happen. - if (is_nan(r)) { - store(x_s, self_shid, zero()); - x[row] = zero(); - *nan_produced = true; - } -} - - -template -__global__ void sptrsv_naive_legacy_kernel( - const IndexType* const rowptrs, const IndexType* const colidxs, - const ValueType* const vals, const ValueType* const b, ValueType* const x, - const size_type n, bool* nan_produced) -{ - const auto gid = thread::get_thread_id_flat(); - const auto row = gid; - - if (row >= n) { - return; - } - - const auto row_end = rowptrs[row + 1] - 1; - - ValueType sum = 0.0; - auto j = rowptrs[row]; - while (j <= row_end) { - auto col = colidxs[j]; - while (!is_nan(load(x, col))) { - sum += vals[j] * load(x, col); - ++j; - col = colidxs[j]; - } - if (row == col) { - const auto r = (b[row] - sum) / vals[row_end]; - store(x, row, r); - ++j; - if (is_nan(r)) { - store(x, row, zero()); - *nan_produced = true; - } - } - } -} - - -template -void sptrsv_naive_caching(std::shared_ptr exec, - const matrix::Csr* matrix, - const matrix::Dense* b, - matrix::Dense* x) -{ - // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. 
- const auto is_fallback_required = exec->get_major_version() < 7; - - const IndexType n = matrix->get_size()[0]; - - // Initialize x to all NaNs. - cudaMemset(x->get_values(), 0xFF, n * sizeof(ValueType)); - - Array nan_produced(exec, 1); - cudaMemset(nan_produced.get_data(), false, sizeof(bool)); - - const dim3 block_size(is_fallback_required ? 32 : default_block_size, 1, 1); - const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); - - if (is_fallback_required) { - sptrsv_naive_legacy_kernel<<>>( - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - as_cuda_type(matrix->get_const_values()), - as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - n, nan_produced.get_data()); - } else { - sptrsv_naive_caching_kernel<<>>( - matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), - as_cuda_type(matrix->get_const_values()), - as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - n, nan_produced.get_data()); - } - - nan_produced.set_executor(exec->get_master()); - if (nan_produced.get_const_data()[0]) { -#if GKO_VERBOSE_LEVEL >= 1 - std::cerr - << "Error: triangular solve produced NaN, either not all diagonal " - "elements are nonzero, or the system is very ill-conditioned. " - "The NaN will be replaced with a zero." - << std::endl; -#endif // GKO_VERBOSE_LEVEL >= 1 - } -} - - void should_perform_transpose(std::shared_ptr exec, bool& do_transpose) { @@ -282,7 +96,7 @@ void solve(std::shared_ptr exec, solve_kernel(exec, matrix, solve_struct, trans_b, trans_x, b, x); } else { - sptrsv_naive_caching(exec, matrix, b, x); + sptrsv_naive_caching(exec, matrix, b, x); } } diff --git a/cuda/solver/upper_trs_kernels.cu b/cuda/solver/upper_trs_kernels.cu index 1f6f9c283f2..eeecdd24d02 100644 --- a/cuda/solver/upper_trs_kernels.cu +++ b/cuda/solver/upper_trs_kernels.cu @@ -75,8 +75,10 @@ void generate(std::shared_ptr exec, std::shared_ptr& solve_struct, const gko::size_type num_rhs) { - generate_kernel(exec, matrix, solve_struct, num_rhs, - true); + if (matrix->get_strategy()->get_name() == "sparselib") { + generate_kernel(exec, matrix, solve_struct, + num_rhs, true); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( @@ -90,8 +92,12 @@ void solve(std::shared_ptr exec, matrix::Dense* trans_b, matrix::Dense* trans_x, const matrix::Dense* b, matrix::Dense* x) { - solve_kernel(exec, matrix, solve_struct, trans_b, - trans_x, b, x); + if (matrix->get_strategy()->get_name() == "sparselib") { + solve_kernel(exec, matrix, solve_struct, trans_b, + trans_x, b, x); + } else { + sptrsv_naive_caching(exec, matrix, b, x); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index 1cba5558312..64e4fcdcf83 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -134,6 +134,22 @@ TEST_F(LowerTrs, CudaLowerTrsFlagCheckIsCorrect) } +TEST_F(LowerTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef) +{ + initialize_data(50, 1); + auto lower_trs_factory = gko::solver::LowerTrs<>::build().on(ref); + auto d_lower_trs_factory = gko::solver::LowerTrs<>::build().on(cuda); + d_csr_mtx->set_strategy(std::make_shared()); + auto solver = lower_trs_factory->generate(csr_mtx); + auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + TEST_F(LowerTrs, CudaSingleRhsApplyIsEquivalentToRef) { initialize_data(50, 1); diff --git 
a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index 78f1954ee3b..c2f1f120061 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -134,6 +134,22 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect) } +TEST_F(UpperTrs, CudaSingleRhsApplyClassicalIsEquivalentToRef) +{ + initialize_data(50, 1); + auto upper_trs_factory = gko::solver::UpperTrs<>::build().on(ref); + auto d_upper_trs_factory = gko::solver::UpperTrs<>::build().on(cuda); + d_csr_mtx->set_strategy(std::make_shared()); + auto solver = upper_trs_factory->generate(csr_mtx); + auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); + + solver->apply(b2.get(), x.get()); + d_solver->apply(d_b2.get(), d_x.get()); + + GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); +} + + TEST_F(UpperTrs, CudaSingleRhsApplyIsEquivalentToRef) { initialize_data(50, 1); From 905c3839c0557c88c8657bb17a4880a9dc459d5d Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 19 Oct 2021 22:12:05 +0200 Subject: [PATCH 08/15] pass different vectors to trisolve analysis --- cuda/solver/common_trs_kernels.cuh | 38 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 7c1d68255b7..71d0b27a13c 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -111,7 +111,7 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // cusparse needs non-nullptr input vectors even for analysis auto descr_b = cusparse::create_dnmat( matrix->get_size()[0], num_rhs, matrix->get_size()[1], - reinterpret_cast(0xFF)); + reinterpret_cast(0xCC)); auto descr_c = cusparse::create_dnmat( matrix->get_size()[0], num_rhs, matrix->get_size()[1], reinterpret_cast(0xFF)); @@ -325,36 +325,36 @@ constexpr int default_block_size = 512; template __device__ std::enable_if_t::value, ValueType> -load(const ValueType *values, int index) +load(const ValueType* values, int index) { - const volatile ValueType *val = values + index; + const volatile ValueType* val = values + index; return *val; } template __device__ std::enable_if_t::value, thrust::complex> -load(const thrust::complex *values, int index) +load(const thrust::complex* values, int index) { - auto real = reinterpret_cast(values); + auto real = reinterpret_cast(values); auto imag = real + 1; return {load(real, 2 * index), load(imag, 2 * index)}; } template __device__ void store( - ValueType *values, int index, + ValueType* values, int index, std::enable_if_t::value, ValueType> value) { - volatile ValueType *val = values + index; + volatile ValueType* val = values + index; *val = value; } template -__device__ void store(thrust::complex *values, int index, +__device__ void store(thrust::complex* values, int index, thrust::complex value) { - auto real = reinterpret_cast(values); + auto real = reinterpret_cast(values); auto imag = real + 1; store(real, 2 * index, value.real()); store(imag, 2 * index, value.imag()); @@ -363,9 +363,9 @@ __device__ void store(thrust::complex *values, int index, template __global__ void sptrsv_naive_caching_kernel( - const IndexType *const rowptrs, const IndexType *const colidxs, - const ValueType *const vals, const ValueType *const b, ValueType *const x, - const size_type n, bool *nan_produced) + const IndexType* const rowptrs, const IndexType* const colidxs, + const ValueType* const vals, const ValueType* const b, ValueType* const x, + const size_type n, bool* nan_produced) { const auto gid = 
thread::get_thread_id_flat(); const auto row = is_upper ? n - 1 - gid : gid; @@ -378,7 +378,7 @@ __global__ void sptrsv_naive_caching_kernel( const auto self_shid = gid % default_block_size; __shared__ UninitializedArray x_s_array; - ValueType *x_s = x_s_array; + ValueType* x_s = x_s_array; x_s[self_shid] = nan(); __syncthreads(); @@ -428,9 +428,9 @@ __global__ void sptrsv_naive_caching_kernel( template __global__ void sptrsv_naive_legacy_kernel( - const IndexType *const rowptrs, const IndexType *const colidxs, - const ValueType *const vals, const ValueType *const b, ValueType *const x, - const size_type n, bool *nan_produced) + const IndexType* const rowptrs, const IndexType* const colidxs, + const ValueType* const vals, const ValueType* const b, ValueType* const x, + const size_type n, bool* nan_produced) { const auto gid = thread::get_thread_id_flat(); const auto row = is_upper ? n - 1 - gid : gid; @@ -471,9 +471,9 @@ __global__ void sptrsv_naive_legacy_kernel( template void sptrsv_naive_caching(std::shared_ptr exec, - const matrix::Csr *matrix, - const matrix::Dense *b, - matrix::Dense *x) + const matrix::Csr* matrix, + const matrix::Dense* b, + matrix::Dense* x) { // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. const auto is_fallback_required = exec->get_major_version() < 7; From bee6006e82d8b65ed9528daa8274e7aee4035838 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Tue, 19 Oct 2021 22:19:10 +0200 Subject: [PATCH 09/15] add support for multiple RHS to custom trisolve --- cuda/solver/common_trs_kernels.cuh | 65 +++++++++++++++----------- cuda/test/solver/lower_trs_kernels.cpp | 28 ++++++++++- cuda/test/solver/upper_trs_kernels.cpp | 28 ++++++++++- 3 files changed, 89 insertions(+), 32 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 71d0b27a13c..475212d20ca 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -364,18 +364,21 @@ __device__ void store(thrust::complex* values, int index, template __global__ void sptrsv_naive_caching_kernel( const IndexType* const rowptrs, const IndexType* const colidxs, - const ValueType* const vals, const ValueType* const b, ValueType* const x, - const size_type n, bool* nan_produced) + const ValueType* const vals, const ValueType* const b, size_type b_stride, + ValueType* const x, size_type x_stride, const size_type n, + const size_type nrhs, bool* nan_produced) { - const auto gid = thread::get_thread_id_flat(); + const auto full_gid = thread::get_thread_id_flat(); + const auto rhs = full_gid % nrhs; + const auto gid = full_gid / nrhs; const auto row = is_upper ? n - 1 - gid : gid; if (gid >= n) { return; } - const auto self_shmem_id = gid / default_block_size; - const auto self_shid = gid % default_block_size; + const auto self_shmem_id = full_gid / default_block_size; + const auto self_shid = full_gid % default_block_size; __shared__ UninitializedArray x_s_array; ValueType* x_s = x_s_array; @@ -394,11 +397,12 @@ __global__ void sptrsv_naive_caching_kernel( ValueType sum = 0.0; for (auto i = row_begin; i != row_diag; i += row_step) { const auto dependency = colidxs[i]; - auto x_p = &x[dependency]; + auto x_p = &x[dependency * x_stride + rhs]; - const auto dependency_gid = is_upper ? n - 1 - dependency : dependency; + const auto dependency_gid = is_upper ? 
(n - 1 - dependency) * nrhs + rhs + : dependency * nrhs + rhs; const bool shmem_possible = - dependency_gid / default_block_size == self_shmem_id; + (dependency_gid / default_block_size) == self_shmem_id; if (shmem_possible) { const auto dependency_shid = dependency_gid % default_block_size; x_p = &x_s[dependency_shid]; @@ -412,15 +416,15 @@ __global__ void sptrsv_naive_caching_kernel( sum += x * vals[i]; } - const auto r = (b[row] - sum) / vals[row_diag]; + const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag]; store(x_s, self_shid, r); - x[row] = r; + x[row * x_stride + rhs] = r; // This check to ensure no infinte loops happen. if (is_nan(r)) { store(x_s, self_shid, zero()); - x[row] = zero(); + x[row * x_stride + rhs] = zero(); *nan_produced = true; } } @@ -429,10 +433,13 @@ __global__ void sptrsv_naive_caching_kernel( template __global__ void sptrsv_naive_legacy_kernel( const IndexType* const rowptrs, const IndexType* const colidxs, - const ValueType* const vals, const ValueType* const b, ValueType* const x, - const size_type n, bool* nan_produced) + const ValueType* const vals, const ValueType* const b, size_type b_stride, + ValueType* const x, size_type x_stride, const size_type n, + const size_type nrhs, bool* nan_produced) { - const auto gid = thread::get_thread_id_flat(); + const auto full_gid = thread::get_thread_id_flat(); + const auto rhs = full_gid % nrhs; + const auto gid = full_gid / nrhs; const auto row = is_upper ? n - 1 - gid : gid; if (gid >= n) { @@ -451,17 +458,17 @@ __global__ void sptrsv_naive_legacy_kernel( auto j = row_begin; while (j != row_diag + row_step) { auto col = colidxs[j]; - while (!is_nan(load(x, col))) { - sum += vals[j] * load(x, col); + while (!is_nan(load(x, col * x_stride + rhs))) { + sum += vals[j] * load(x, col * x_stride + rhs); j += row_step; col = colidxs[j]; } if (row == col) { - const auto r = (b[row] - sum) / vals[row_diag]; - store(x, row, r); + const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag]; + store(x, row * x_stride + rhs, r); j += row_step; if (is_nan(r)) { - store(x, row, zero()); + store(x, row * x_stride + rhs, zero()); *nan_produced = true; } } @@ -478,29 +485,31 @@ void sptrsv_naive_caching(std::shared_ptr exec, // Pre-Volta GPUs may deadlock due to missing independent thread scheduling. const auto is_fallback_required = exec->get_major_version() < 7; - const IndexType n = matrix->get_size()[0]; + const auto n = matrix->get_size()[0]; + const auto nrhs = b->get_size()[1]; // Initialize x to all NaNs. - cudaMemset(x->get_values(), 0xFF, n * sizeof(ValueType)); + x->fill(nan()); - Array nan_produced(exec, 1); - cudaMemset(nan_produced.get_data(), false, sizeof(bool)); + Array nan_produced(exec, {false}); const dim3 block_size(is_fallback_required ? 
32 : default_block_size, 1, 1); - const dim3 grid_size(ceildiv(n, block_size.x), 1, 1); + const dim3 grid_size(ceildiv(n * nrhs, block_size.x), 1, 1); if (is_fallback_required) { sptrsv_naive_legacy_kernel<<>>( matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), as_cuda_type(matrix->get_const_values()), - as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - n, nan_produced.get_data()); + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, + nan_produced.get_data()); } else { sptrsv_naive_caching_kernel<<>>( matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), as_cuda_type(matrix->get_const_values()), - as_cuda_type(b->get_const_values()), as_cuda_type(x->get_values()), - n, nan_produced.get_data()); + as_cuda_type(b->get_const_values()), b->get_stride(), + as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, + nan_produced.get_data()); } if (exec->copy_val_to_host(nan_produced.get_const_data())) { diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index 64e4fcdcf83..514562f97fd 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -165,6 +165,27 @@ TEST_F(LowerTrs, CudaSingleRhsApplyIsEquivalentToRef) } +TEST_F(LowerTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef) +{ + initialize_data(50, 3); + auto lower_trs_factory = + gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(ref); + auto d_lower_trs_factory = + gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(cuda); + d_csr_mtx->set_strategy(std::make_shared()); + auto solver = lower_trs_factory->generate(csr_mtx); + auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); + auto db2_strided = Mtx::create(cuda, b->get_size(), 4); + d_b2->convert_to(db2_strided.get()); + auto dx_strided = Mtx::create(cuda, x->get_size(), 5); + + solver->apply(b2.get(), x.get()); + d_solver->apply(db2_strided.get(), dx_strided.get()); + + GKO_ASSERT_MTX_NEAR(dx_strided, x, 1e-14); +} + + TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef) { initialize_data(50, 3); @@ -174,11 +195,14 @@ TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef) gko::solver::LowerTrs<>::build().with_num_rhs(3u).on(cuda); auto solver = lower_trs_factory->generate(csr_mtx); auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); + auto db2_strided = Mtx::create(cuda, b->get_size(), 4); + d_b2->convert_to(db2_strided.get()); + auto dx_strided = Mtx::create(cuda, x->get_size(), 5); solver->apply(b2.get(), x.get()); - d_solver->apply(d_b2.get(), d_x.get()); + d_solver->apply(db2_strided.get(), dx_strided.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx_strided, x, 1e-14); } diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index c2f1f120061..13f6907141d 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -165,6 +165,27 @@ TEST_F(UpperTrs, CudaSingleRhsApplyIsEquivalentToRef) } +TEST_F(UpperTrs, CudaMultipleRhsApplyClassicalIsEquivalentToRef) +{ + initialize_data(50, 3); + auto upper_trs_factory = + gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(ref); + auto d_upper_trs_factory = + gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(cuda); + d_csr_mtx->set_strategy(std::make_shared()); + auto solver = upper_trs_factory->generate(csr_mtx); + auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); + auto db2_strided = Mtx::create(cuda, b->get_size(), 4); + 
d_b2->convert_to(db2_strided.get()); + auto dx_strided = Mtx::create(cuda, x->get_size(), 5); + + solver->apply(b2.get(), x.get()); + d_solver->apply(db2_strided.get(), dx_strided.get()); + + GKO_ASSERT_MTX_NEAR(dx_strided, x, 1e-14); +} + + TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef) { initialize_data(50, 3); @@ -174,11 +195,14 @@ TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef) gko::solver::UpperTrs<>::build().with_num_rhs(3u).on(cuda); auto solver = upper_trs_factory->generate(csr_mtx); auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); + auto db2_strided = Mtx::create(cuda, b->get_size(), 4); + d_b2->convert_to(db2_strided.get()); + auto dx_strided = Mtx::create(cuda, x->get_size(), 5); solver->apply(b2.get(), x.get()); - d_solver->apply(d_b2.get(), d_x.get()); + d_solver->apply(db2_strided.get(), dx_strided.get()); - GKO_ASSERT_MTX_NEAR(d_x, x, 1e-14); + GKO_ASSERT_MTX_NEAR(dx_strided, x, 1e-14); } From ccd0dea89eaa8d48b1b9620600453f485f4fe459 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 3 Nov 2021 10:45:59 +0100 Subject: [PATCH 10/15] fix potential threadblock-level deadlock --- cuda/solver/common_trs_kernels.cuh | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 475212d20ca..93bedaeaed3 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -53,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/math.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" +#include "cuda/components/atomic.cuh" #include "cuda/components/thread_ids.cuh" #include "cuda/components/uninitialized_array.hpp" @@ -321,6 +322,7 @@ void solve_kernel(std::shared_ptr exec, constexpr int default_block_size = 512; +constexpr int fallback_block_size = 32; template @@ -366,9 +368,15 @@ __global__ void sptrsv_naive_caching_kernel( const IndexType* const rowptrs, const IndexType* const colidxs, const ValueType* const vals, const ValueType* const b, size_type b_stride, ValueType* const x, size_type x_stride, const size_type n, - const size_type nrhs, bool* nan_produced) + const size_type nrhs, bool* nan_produced, IndexType* atomic_counter) { - const auto full_gid = thread::get_thread_id_flat(); + __shared__ IndexType block_base_idx; + if (threadIdx.x == 0) { + block_base_idx = + atomic_add(atomic_counter, IndexType{1}) * default_block_size; + } + __syncthreads(); + const auto full_gid = static_cast(threadIdx.x) + block_base_idx; const auto rhs = full_gid % nrhs; const auto gid = full_gid / nrhs; const auto row = is_upper ? n - 1 - gid : gid; @@ -435,9 +443,15 @@ __global__ void sptrsv_naive_legacy_kernel( const IndexType* const rowptrs, const IndexType* const colidxs, const ValueType* const vals, const ValueType* const b, size_type b_stride, ValueType* const x, size_type x_stride, const size_type n, - const size_type nrhs, bool* nan_produced) + const size_type nrhs, bool* nan_produced, IndexType* atomic_counter) { - const auto full_gid = thread::get_thread_id_flat(); + __shared__ IndexType block_base_idx; + if (threadIdx.x == 0) { + block_base_idx = + atomic_add(atomic_counter, IndexType{1}) * fallback_block_size; + } + __syncthreads(); + const auto full_gid = static_cast(threadIdx.x) + block_base_idx; const auto rhs = full_gid % nrhs; const auto gid = full_gid / nrhs; const auto row = is_upper ? 
n - 1 - gid : gid; @@ -492,8 +506,10 @@ void sptrsv_naive_caching(std::shared_ptr exec, x->fill(nan()); Array nan_produced(exec, {false}); + Array atomic_counter(exec, {IndexType{}}); - const dim3 block_size(is_fallback_required ? 32 : default_block_size, 1, 1); + const dim3 block_size( + is_fallback_required ? fallback_block_size : default_block_size, 1, 1); const dim3 grid_size(ceildiv(n * nrhs, block_size.x), 1, 1); if (is_fallback_required) { @@ -502,14 +518,14 @@ void sptrsv_naive_caching(std::shared_ptr exec, as_cuda_type(matrix->get_const_values()), as_cuda_type(b->get_const_values()), b->get_stride(), as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, - nan_produced.get_data()); + nan_produced.get_data(), atomic_counter.get_data()); } else { sptrsv_naive_caching_kernel<<>>( matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(), as_cuda_type(matrix->get_const_values()), as_cuda_type(b->get_const_values()), b->get_stride(), as_cuda_type(x->get_values()), x->get_stride(), n, nrhs, - nan_produced.get_data()); + nan_produced.get_data(), atomic_counter.get_data()); } if (exec->copy_val_to_host(nan_produced.get_const_data())) { From 420d6db5fa6a7dcdce054591ec207aa83cb7a15b Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Sun, 14 Nov 2021 19:44:57 +0100 Subject: [PATCH 11/15] Review updates --- cuda/base/cusparse_bindings.hpp | 18 ++++----- cuda/solver/common_trs_kernels.cuh | 59 +++++++++++++++++++----------- include/ginkgo/core/base/math.hpp | 8 ++-- 3 files changed, 50 insertions(+), 35 deletions(-) diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index 5551bede3f7..cd43bccb60f 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -559,9 +559,9 @@ void spgemm_copy(cusparseHandle_t handle, const ValueType* alpha, inline size_type sparse_matrix_nnz(cusparseSpMatDescr_t descr) { - int64_t dummy1{}; - int64_t dummy2{}; - int64_t nnz{}; + int64 dummy1{}; + int64 dummy2{}; + int64 nnz{}; cusparseSpMatGetSize(descr, &dummy1, &dummy2, &nnz); return static_cast(nnz); } @@ -803,7 +803,7 @@ inline void destroy(cusparseSpGEMMDescr_t info) template -inline cusparseDnVecDescr_t create_dnvec(int64_t size, ValueType* values) +inline cusparseDnVecDescr_t create_dnvec(int64 size, ValueType* values) { cusparseDnVecDescr_t descr{}; constexpr auto value_type = cuda_data_type(); @@ -820,8 +820,8 @@ inline void destroy(cusparseDnVecDescr_t descr) template -inline cusparseDnMatDescr_t create_dnmat(int64_t rows, int64_t cols, - int64_t stride, ValueType* values) +inline cusparseDnMatDescr_t create_dnmat(int64 rows, int64 cols, int64 stride, + ValueType* values) { cusparseDnMatDescr_t descr{}; constexpr auto value_type = cuda_data_type(); @@ -838,7 +838,7 @@ inline void destroy(cusparseDnMatDescr_t descr) template -inline cusparseSpVecDescr_t create_spvec(int64_t size, int64_t nnz, +inline cusparseSpVecDescr_t create_spvec(int64 size, int64 nnz, IndexType* indices, ValueType* values) { cusparseSpVecDescr_t descr{}; @@ -858,7 +858,7 @@ inline void destroy(cusparseSpVecDescr_t descr) template -inline cusparseSpMatDescr_t create_csr(int64_t rows, int64_t cols, int64_t nnz, +inline cusparseSpMatDescr_t create_csr(int64 rows, int64 cols, int64 nnz, IndexType* csrRowOffsets, IndexType* csrColInd, ValueType* csrValues) @@ -1153,7 +1153,7 @@ void spsm_solve(cusparseHandle_t handle, cusparseOperation_t op_a, } -#endif +#endif // (defined(CUDA_VERSION) && (CUDA_VERSION >= 11030)) template diff --git a/cuda/solver/common_trs_kernels.cuh 
b/cuda/solver/common_trs_kernels.cuh index 93bedaeaed3..33bd6c2af6a 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -83,6 +83,8 @@ struct CudaSolveStruct : gko::solver::SolveStruct { cusparseHandle_t handle; cusparseSpSMDescr_t spsm_descr; cusparseSpMatDescr_t descr_a; + + // Implicit parameter in spsm_solve, therefore stored here. Array work; CudaSolveStruct(std::shared_ptr exec, @@ -112,10 +114,10 @@ struct CudaSolveStruct : gko::solver::SolveStruct { // cusparse needs non-nullptr input vectors even for analysis auto descr_b = cusparse::create_dnmat( matrix->get_size()[0], num_rhs, matrix->get_size()[1], - reinterpret_cast(0xCC)); + reinterpret_cast(0xDEAD)); auto descr_c = cusparse::create_dnmat( matrix->get_size()[0], num_rhs, matrix->get_size()[1], - reinterpret_cast(0xFF)); + reinterpret_cast(0xDEAF)); auto work_size = cusparse::spsm_buffer_size( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -325,36 +327,38 @@ constexpr int default_block_size = 512; constexpr int fallback_block_size = 32; -template -__device__ std::enable_if_t::value, ValueType> -load(const ValueType* values, int index) +template +__device__ __forceinline__ + std::enable_if_t::value, ValueType> + load(const ValueType* values, IndexType index) { const volatile ValueType* val = values + index; return *val; } -template -__device__ std::enable_if_t::value, - thrust::complex> -load(const thrust::complex* values, int index) +template +__device__ __forceinline__ std::enable_if_t< + std::is_floating_point::value, thrust::complex> +load(const thrust::complex* values, IndexType index) { auto real = reinterpret_cast(values); auto imag = real + 1; return {load(real, 2 * index), load(imag, 2 * index)}; } -template -__device__ void store( - ValueType* values, int index, +template +__device__ __forceinline__ void store( + ValueType* values, IndexType index, std::enable_if_t::value, ValueType> value) { volatile ValueType* val = values + index; *val = value; } -template -__device__ void store(thrust::complex* values, int index, - thrust::complex value) +template +__device__ __forceinline__ void store(thrust::complex* values, + IndexType index, + thrust::complex value) { auto real = reinterpret_cast(values); auto imag = real + 1; @@ -370,7 +374,9 @@ __global__ void sptrsv_naive_caching_kernel( ValueType* const x, size_type x_stride, const size_type n, const size_type nrhs, bool* nan_produced, IndexType* atomic_counter) { + __shared__ UninitializedArray x_s_array; __shared__ IndexType block_base_idx; + if (threadIdx.x == 0) { block_base_idx = atomic_add(atomic_counter, IndexType{1}) * default_block_size; @@ -388,7 +394,6 @@ __global__ void sptrsv_naive_caching_kernel( const auto self_shmem_id = full_gid / default_block_size; const auto self_shid = full_gid % default_block_size; - __shared__ UninitializedArray x_s_array; ValueType* x_s = x_s_array; x_s[self_shid] = nan(); @@ -490,6 +495,15 @@ __global__ void sptrsv_naive_legacy_kernel( } +template +__global__ void sptrsv_init_kernel(bool* const nan_produced, + IndexType* const atomic_counter) +{ + *nan_produced = false; + *atomic_counter = IndexType{}; +} + + template void sptrsv_naive_caching(std::shared_ptr exec, const matrix::Csr* matrix, @@ -505,8 +519,10 @@ void sptrsv_naive_caching(std::shared_ptr exec, // Initialize x to all NaNs. 
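    // A quiet NaN doubles as the "not yet computed" flag here: NaN never
    // compares equal to anything, so is_nan(x[i]) is a cheap emptiness
    // test and the kernels need no separate `ready` array.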
x->fill(nan()); - Array nan_produced(exec, {false}); - Array atomic_counter(exec, {IndexType{}}); + Array nan_produced(exec, 1); + Array atomic_counter(exec, 1); + sptrsv_init_kernel<<<1, 1>>>(nan_produced.get_data(), + atomic_counter.get_data()); const dim3 block_size( is_fallback_required ? fallback_block_size : default_block_size, 1, 1); @@ -528,15 +544,14 @@ void sptrsv_naive_caching(std::shared_ptr exec, nan_produced.get_data(), atomic_counter.get_data()); } - if (exec->copy_val_to_host(nan_produced.get_const_data())) { #if GKO_VERBOSE_LEVEL >= 1 + if (exec->copy_val_to_host(nan_produced.get_const_data())) { std::cerr << "Error: triangular solve produced NaN, either not all diagonal " "elements are nonzero, or the system is very ill-conditioned. " - "The NaN will be replaced with a zero." - << std::endl; -#endif // GKO_VERBOSE_LEVEL >= 1 + "The NaN will be replaced with a zero.\n"; } +#endif // GKO_VERBOSE_LEVEL >= 1 } diff --git a/include/ginkgo/core/base/math.hpp b/include/ginkgo/core/base/math.hpp index 8af4cde253b..915c4e5a262 100644 --- a/include/ginkgo/core/base/math.hpp +++ b/include/ginkgo/core/base/math.hpp @@ -1107,7 +1107,7 @@ GKO_INLINE GKO_ATTRIBUTES T safe_divide(T a, T b) */ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> -is_nan(const T &value) +is_nan(const T& value) { return std::isnan(value); } @@ -1124,14 +1124,14 @@ is_nan(const T &value) */ template GKO_INLINE GKO_ATTRIBUTES std::enable_if_t::value, bool> is_nan( - const T &value) + const T& value) { return std::isnan(value.real()) || std::isnan(value.imag()); } /** - * Returns NaN of the given type. + * Returns a quiet NaN of the given type. * * @tparam T the type of the object * @@ -1146,7 +1146,7 @@ nan() /** - * Returns a complex with both components NaN. + * Returns a complex with both components quiet NaN. 
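 * (For example, is_nan(nan<std::complex<double>>()) holds, since both
 * components are quiet NaN; the sptrsv kernels rely on exactly this when
 * they pre-fill the solution vector x with nan() before solving.)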
* * @tparam T the type of the object * From 73c45ad91bb686a3c25e1cf31c3f989ee250cb48 Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Thu, 9 Dec 2021 17:33:31 +0100 Subject: [PATCH 12/15] Review updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Grützmacher --- cuda/solver/common_trs_kernels.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 33bd6c2af6a..31f957b939e 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -477,10 +477,12 @@ __global__ void sptrsv_naive_legacy_kernel( auto j = row_begin; while (j != row_diag + row_step) { auto col = colidxs[j]; - while (!is_nan(load(x, col * x_stride + rhs))) { - sum += vals[j] * load(x, col * x_stride + rhs); + auto x_val = load(x, col * x_stride + rhs); + while (!is_nan(x_val)) { + sum += vals[j] * x_val; j += row_step; col = colidxs[j]; + x_val = load(x, col * x_stride + rhs); } if (row == col) { const auto r = (b[row * b_stride + rhs] - sum) / vals[row_diag]; From 97a2a1755649e58173c5838eb92675cacea8892e Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Fri, 10 Dec 2021 22:36:50 +0100 Subject: [PATCH 13/15] fix circular dependency --- cuda/solver/common_trs_kernels.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda/solver/common_trs_kernels.cuh b/cuda/solver/common_trs_kernels.cuh index 31f957b939e..8c2cb010d6c 100644 --- a/cuda/solver/common_trs_kernels.cuh +++ b/cuda/solver/common_trs_kernels.cuh @@ -519,7 +519,7 @@ void sptrsv_naive_caching(std::shared_ptr exec, const auto nrhs = b->get_size()[1]; // Initialize x to all NaNs. - x->fill(nan()); + dense::fill(exec, x, nan()); Array nan_produced(exec, 1); Array atomic_counter(exec, 1); From 4a4bff942a9c20f8f373e91e7ba0caf2f7f6a95b Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Sat, 11 Dec 2021 13:11:24 +0100 Subject: [PATCH 14/15] avoid testing buggy cuSPARSE behavior --- cuda/test/solver/lower_trs_kernels.cpp | 6 ++++++ cuda/test/solver/upper_trs_kernels.cpp | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp index 514562f97fd..ec8d02a9a57 100644 --- a/cuda/test/solver/lower_trs_kernels.cpp +++ b/cuda/test/solver/lower_trs_kernels.cpp @@ -197,7 +197,13 @@ TEST_F(LowerTrs, CudaMultipleRhsApplyIsEquivalentToRef) auto d_solver = d_lower_trs_factory->generate(d_csr_mtx); auto db2_strided = Mtx::create(cuda, b->get_size(), 4); d_b2->convert_to(db2_strided.get()); + // The cuSPARSE Generic SpSM implementation uses the wrong stride here + // so the input and output stride need to match +#if CUDA_VERSION >= 11030 + auto dx_strided = Mtx::create(cuda, x->get_size(), 4); +#else auto dx_strided = Mtx::create(cuda, x->get_size(), 5); +#endif solver->apply(b2.get(), x.get()); d_solver->apply(db2_strided.get(), dx_strided.get()); diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp index 13f6907141d..9e8918815f4 100644 --- a/cuda/test/solver/upper_trs_kernels.cpp +++ b/cuda/test/solver/upper_trs_kernels.cpp @@ -197,7 +197,13 @@ TEST_F(UpperTrs, CudaMultipleRhsApplyIsEquivalentToRef) auto d_solver = d_upper_trs_factory->generate(d_csr_mtx); auto db2_strided = Mtx::create(cuda, b->get_size(), 4); d_b2->convert_to(db2_strided.get()); + // The cuSPARSE Generic SpSM implementation uses the wrong stride here + // so the input and output stride 
need to match +#if CUDA_VERSION >= 11030 + auto dx_strided = Mtx::create(cuda, x->get_size(), 4); +#else auto dx_strided = Mtx::create(cuda, x->get_size(), 5); +#endif solver->apply(b2.get(), x.get()); d_solver->apply(db2_strided.get(), dx_strided.get()); From 3916a4b641b219110a9071e42d22c9a782aaf517 Mon Sep 17 00:00:00 2001 From: Lukas Riemer Date: Tue, 14 Dec 2021 10:33:38 +0100 Subject: [PATCH 15/15] complete sptrsv dpcpp stub --- dpcpp/solver/lower_trs_kernels.dp.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dpcpp/solver/lower_trs_kernels.dp.cpp b/dpcpp/solver/lower_trs_kernels.dp.cpp index ab372fc45fc..cede13f1c20 100644 --- a/dpcpp/solver/lower_trs_kernels.dp.cpp +++ b/dpcpp/solver/lower_trs_kernels.dp.cpp @@ -59,6 +59,13 @@ namespace dpcpp { namespace lower_trs { +void should_perform_transpose(std::shared_ptr exec, + bool& do_transpose) +{ + do_transpose = false; +} + + template void generate(std::shared_ptr exec, const matrix::Csr* matrix,