Skip to content

Commit

Permalink
Merge modification of CUDA multiple RHS
Browse files Browse the repository at this point in the history
Fixed / improved the following parts in the cuda `Csr` kernels:
- `compute_items_per_thread` now supports various `IndexType`s
- namespace renaming to make intentions more clear
- using `zero_array` instead of `cuda_memset`
- Adding comment to explain the `switch-case`
- Added exception throwing when unsupported types/functionality is used

Also, removed unused parameters in cuda `Hybrid` test.


Fixes parts of PR: #262
PR for mentioned changes:
#265
  • Loading branch information
Thomas Grützmacher authored Mar 20, 2019
2 parents 01b351d + d37626e commit 54cd5d4
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 28 deletions.
63 changes: 39 additions & 24 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "core/matrix/csr_kernels.hpp"


#include <algorithm>


#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/dense.hpp>
Expand All @@ -47,6 +50,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cuda/components/atomic.cuh"
#include "cuda/components/cooperative_groups.cuh"
#include "cuda/components/uninitialized_array.hpp"
#include "cuda/components/zero_array.hpp"


namespace gko {
Expand Down Expand Up @@ -549,7 +553,7 @@ __global__ __launch_bounds__(classical_block_size) void abstract_classical_spmv(
} // namespace kernel


namespace {
namespace host_kernel {


template <int items_per_thread, typename ValueType, typename IndexType>
Expand Down Expand Up @@ -617,11 +621,14 @@ void merge_path_spmv(syn::value_list<int, items_per_thread>,
GKO_ENABLE_IMPLEMENTATION_SELECTION(select_merge_path_spmv, merge_path_spmv);


template <typename IndexType>
template <typename ValueType, typename IndexType>
int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
{
const int version = exec->get_major_version()
<< 4 + exec->get_minor_version();
// The num_item is decided to make the occupancy 100%
// TODO: Extend this list when new GPU is released
// Tune this parameter
// 128 threads/block the number of items per threads
// 3.0 3.5: 6
// 3.7: 14
Expand All @@ -643,14 +650,17 @@ int compute_items_per_thread(std::shared_ptr<const CudaExecutor> exec)
case 0x37:
num_item = 14;
}
// The calculation is based on size(IndexType) = 4
constexpr int index_scale = sizeof(IndexType) / 4;
int items_per_thread = num_item / index_scale;
return items_per_thread;
// Ensure that satisfy:
// sizeof(IndexType) + sizeof(ValueType)
// <= items_per_thread * sizeof(IndexType)
constexpr int minimal_num =
ceildiv(sizeof(IndexType) + sizeof(ValueType), sizeof(IndexType));
int items_per_thread = num_item * 4 / sizeof(IndexType);
return std::max(minimal_num, items_per_thread);
}


} // namespace
} // namespace host_kernel


template <typename ValueType, typename IndexType>
Expand All @@ -659,9 +669,7 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
const matrix::Dense<ValueType> *b, matrix::Dense<ValueType> *c)
{
if (a->get_strategy()->get_name() == "load_balance") {
GKO_ASSERT_NO_CUDA_ERRORS(
cudaMemset(c->get_values(), 0,
c->get_num_stored_elements() * sizeof(ValueType)));
zero_array(c->get_num_stored_elements(), c->get_values());
const IndexType nwarps = a->get_num_srow_elements();
if (nwarps > 0) {
const dim3 csr_block(cuda_config::warp_size, warps_in_block, 1);
Expand All @@ -677,13 +685,14 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(c->get_stride()));
}
} else if (a->get_strategy()->get_name() == "merge_path") {
int items_per_thread = compute_items_per_thread<IndexType>(exec);
select_merge_path_spmv(compiled_kernels(),
[&items_per_thread](int compiled_info) {
return items_per_thread == compiled_info;
},
syn::value_list<int>(), syn::type_list<>(), exec,
a, b, c);
int items_per_thread =
host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
host_kernel::select_merge_path_spmv(
compiled_kernels(),
[&items_per_thread](int compiled_info) {
return items_per_thread == compiled_info;
},
syn::value_list<int>(), syn::type_list<>(), exec, a, b, c);
} else if (a->get_strategy()->get_name() == "classical") {
const dim3 grid(ceildiv(a->get_size()[0], classical_block_size),
b->get_size()[1]);
Expand Down Expand Up @@ -717,7 +726,11 @@ void spmv(std::shared_ptr<const CudaExecutor> exec,
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE));

cusparse::destroy(descr);
} else {
GKO_NOT_IMPLEMENTED;
}
} else {
GKO_NOT_IMPLEMENTED;
}
}

Expand Down Expand Up @@ -785,13 +798,15 @@ void advanced_spmv(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(beta->get_const_values()),
as_cuda_type(c->get_values()), c->get_stride());
} else if (a->get_strategy()->get_name() == "merge_path") {
int items_per_thread = compute_items_per_thread<IndexType>(exec);
select_merge_path_spmv(compiled_kernels(),
[&items_per_thread](int compiled_info) {
return items_per_thread == compiled_info;
},
syn::value_list<int>(), syn::type_list<>(), exec,
a, b, c, alpha, beta);
int items_per_thread =
host_kernel::compute_items_per_thread<ValueType, IndexType>(exec);
host_kernel::select_merge_path_spmv(
compiled_kernels(),
[&items_per_thread](int compiled_info) {
return items_per_thread == compiled_info;
},
syn::value_list<int>(), syn::type_list<>(), exec, a, b, c, alpha,
beta);
} else {
GKO_NOT_IMPLEMENTED;
}
Expand Down
7 changes: 3 additions & 4 deletions cuda/test/matrix/hybrid_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ class Hybrid : public ::testing::Test {
std::normal_distribution<>(-1.0, 1.0), rand_engine, ref);
}

void set_up_apply_data(int num_stored_elements_per_row = 0, int stride = 0,
int num_vectors = 1)
void set_up_apply_data(int num_vectors = 1)
{
mtx = Mtx::create(ref);
mtx->copy_from(gen_mtx(532, 231, 1));
Expand Down Expand Up @@ -157,7 +156,7 @@ TEST_F(Hybrid, AdvancedApplyIsEquivalentToRef)

TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(0, 0, 3);
set_up_apply_data(3);

mtx->apply(y.get(), expected.get());
dmtx->apply(dy.get(), dresult.get());
Expand All @@ -168,7 +167,7 @@ TEST_F(Hybrid, SimpleApplyToDenseMatrixIsEquivalentToRef)

TEST_F(Hybrid, AdvancedApplyToDenseMatrixIsEquivalentToRef)
{
set_up_apply_data(0, 0, 3);
set_up_apply_data(3);

mtx->apply(alpha.get(), y.get(), beta.get(), expected.get());
dmtx->apply(dalpha.get(), dy.get(), dbeta.get(), dresult.get());
Expand Down

0 comments on commit 54cd5d4

Please sign in to comment.