Merge modification of CUDA multiple RHS

Fixed / improved the following parts in the cuda `Csr` kernels: - `compute_items_per_thread` now supports various `IndexType`s - namespace renaming to make intentions more clear - using `zero_array` instead of `cuda_memset` - Adding comment to explain the `switch-case` - Added exception throwing when unsupported types/functionality is used Also, removed unused parameters in cuda `Hybrid` test. Fixes parts of PR: #262 PR for mentioned changes: #265
ginkgo-project · Mar 20, 2019 · 54cd5d4 · 54cd5d4
2 parents 01b351d + d37626e
commit 54cd5d4
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 28 deletions.
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
@@ -34,6 +34,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "core/matrix/csr_kernels.hpp"
 
 
+#include <algorithm>
+
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 #include <ginkgo/core/base/math.hpp>
 #include <ginkgo/core/matrix/dense.hpp>
@@ -47,6 +50,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/uninitialized_array.hpp"
+#include "cuda/components/zero_array.hpp"
 
 
 namespace gko {
@@ -549,7 +553,7 @@ __global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv(
 }  // namespace kernel
 
 
-namespace {
+namespace host_kernel {
 
 
 template <int items_per_thread, typename ValueType, typename IndexType>
@@ -617,11 +621,14 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
 GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);
 
 
-template <typename IndexType>
+template <typename ValueType, typename IndexType>
 int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
 {
     const int version = exec->get_major_version()
                         << 4 + exec->get_minor_version();
+    // The num_item is decided to make the occupancy 100%
+    // TODO: Extend this list when new GPU is released
+    //       Tune this parameter
     // 128 threads/block the number of items per threads
     // 3.0 3.5: 6
     // 3.7: 14
@@ -643,14 +650,17 @@ int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
     case 0x37:
         num_item = 14;
     }
-    // The calculation is based on size(IndexType) = 4
-    constexpr int index_scale = sizeof(IndexType) / 4;
-    int items_per_thread = num_item / index_scale;
-    return items_per_thread;
+    // Ensure that satisfy:
+    // sizeof(IndexType) + sizeof(ValueType)
+    // <= items_per_thread * sizeof(IndexType)
+    constexpr int minimal_num =
+        ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
+    int items_per_thread = num_item * 4 / sizeof(IndexType);
+    return std::max(minimal_num, items_per_thread);
 }
 
 
-}  // namespace
+}  // namespace host_kernel
 
 
 template <typename ValueType, typename IndexType>
@@ -659,9 +669,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
           const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
 {
     if (a->get_strategy()->get_name() == "load_balance") {
-        GKO_ASSERT_NO_CUDA_ERRORS(
-            cudaMemset(c->get_values(), 0,
-                       c->get_num_stored_elements() * sizeof(ValueType)));
+        zero_array(c->get_num_stored_elements(), c->get_values());
         const IndexType nwarps = a->get_num_srow_elements();
         if (nwarps > 0) {
             const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1);
@@ -677,13 +685,14 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
                 as_cuda_type(c->get_stride()));
         }
     } else if (a->get_strategy()->get_name() == "merge_path") {
-        int items_per_thread = compute_items_per_thread<IndexType>(exec);
-        select_merge_path_spmv(compiled_kernels(),
-                               [&items_per_thread](int compiled_info) {
-                                   return items_per_thread == compiled_info;
-                               },
-                               syn::value_list<int>(), syn::type_list<>(), exec,
-                               a, b, c);
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
     } else if (a->get_strategy()->get_name() == "classical") {
         const dim3 grid(ceildiv(a->get_size()[0], classical_block_size),
                         b->get_size()[1]);
@@ -717,7 +726,11 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
                 cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE));
 
             cusparse::destroy(descr);
+        } else {
+            GKO_NOT_IMPLEMENTED;
         }
+    } else {
+        GKO_NOT_IMPLEMENTED;
     }
 }
 
@@ -785,13 +798,15 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
             as_cuda_type(beta->get_const_values()),
             as_cuda_type(c->get_values()), c->get_stride());
     } else if (a->get_strategy()->get_name() == "merge_path") {
-        int items_per_thread = compute_items_per_thread<IndexType>(exec);
-        select_merge_path_spmv(compiled_kernels(),
-                               [&items_per_thread](int compiled_info) {
-                                   return items_per_thread == compiled_info;
-                               },
-                               syn::value_list<int>(), syn::type_list<>(), exec,
-                               a, b, c, alpha, beta);
+        int items_per_thread =
+            host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
+        host_kernel::select_merge_path_spmv(
+            compiled_kernels(),
+            [&items_per_thread](int compiled_info) {
+                return items_per_thread == compiled_info;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
+            beta);
     } else {
         GKO_NOT_IMPLEMENTED;
     }

diff --git a/cuda/test/matrix/hybrid_kernels.cpp b/cuda/test/matrix/hybrid_kernels.cpp
@@ -79,8 +79,7 @@ class Hybrid : public ::testing::Test {
             std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
     }
 
-    void set_up_apply_data(int num_stored_elements_per_row = 0, int stride = 0,
-                           int num_vectors = 1)
+    void set_up_apply_data(int num_vectors = 1)
     {
         mtx = Mtx::create(ref);
         mtx->copy_from(gen_mtx(532, 231, 1));
@@ -157,7 +156,7 @@ TEST_F(Hybrid, AdvancedApplyIsEquivalentToRef)
 
 TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(0, 0, 3);
+    set_up_apply_data(3);
 
     mtx->apply(y.get(), expected.get());
     dmtx->apply(dy.get(), dresult.get());
@@ -168,7 +167,7 @@ TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef)
 
 TEST_F(Hybrid, AdvancedApplyToDenseMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(0, 0, 3);
+    set_up_apply_data(3);
 
     mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
     dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());