From fd183c156b063c2c783e0a9d3501bce4a44f9aeb Mon Sep 17 00:00:00 2001
From: Dominique LaSalle
Date: Wed, 17 Mar 2021 11:09:48 -0700
Subject: [PATCH 1/2] Wrap cub with prefixes and remove thrust

---
 src/array/cuda/array_cumsum.cu     |  2 +-
 src/array/cuda/array_nonzero.cu    | 59 +++++++++++++++++++-----------
 src/array/cuda/array_sort.cu       |  2 +-
 src/array/cuda/csr_sort.cu         |  2 +-
 src/array/cuda/dgl_cub.cuh         | 17 +++++++++
 src/array/cuda/utils.cu            |  2 +-
 src/runtime/cuda/cuda_hashtable.cu |  2 +-
 7 files changed, 60 insertions(+), 26 deletions(-)
 create mode 100644 src/array/cuda/dgl_cub.cuh

diff --git a/src/array/cuda/array_cumsum.cu b/src/array/cuda/array_cumsum.cu
index 0156c9e1664d..a63c017c2c68 100644
--- a/src/array/cuda/array_cumsum.cu
+++ b/src/array/cuda/array_cumsum.cu
@@ -4,9 +4,9 @@
  * \brief Array cumsum GPU implementation
  */
 #include <dgl/array.h>
-#include <cub/cub.cuh>
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "./dgl_cub.cuh"
 
 namespace dgl {
 using runtime::NDArray;
diff --git a/src/array/cuda/array_nonzero.cu b/src/array/cuda/array_nonzero.cu
index c4663f4f22da..1dadbf75b0ce 100644
--- a/src/array/cuda/array_nonzero.cu
+++ b/src/array/cuda/array_nonzero.cu
@@ -3,13 +3,11 @@
  * \file array/cuda/array_nonzero.cu
  * \brief Array nonzero GPU implementation
  */
-#include <thrust/copy.h>
-#include <thrust/device_ptr.h>
-#include <thrust/execution_policy.h>
-#include <thrust/iterator/counting_iterator.h>
+
 #include <dgl/array.h>
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "./dgl_cub.cuh"
 
 namespace dgl {
 using runtime::NDArray;
@@ -25,24 +23,43 @@
 template <DLDeviceType XPU, typename IdType>
 IdArray NonZero(IdArray array) {
-  auto* thr_entry = runtime::CUDAThreadEntry::ThreadLocal();
+  const auto& ctx = array->ctx;
+  auto device = runtime::DeviceAPI::Get(ctx);
+
   const int64_t len = array->shape[0];
-  IdArray ret = NewIdArray(len, array->ctx, 64);
-  thrust::device_ptr<IdType> in_data(array.Ptr<IdType>());
-  thrust::device_ptr<int64_t> out_data(ret.Ptr<int64_t>());
-  // TODO(minjie): should take control of the memory allocator.
-  // See PyTorch's implementation here:
-  // https://github.com/pytorch/pytorch/blob/1f7557d173c8e9066ed9542ada8f4a09314a7e17/
-  // aten/src/THC/generic/THCTensorMath.cu#L104
-  auto startiter = thrust::make_counting_iterator(0);
-  auto enditer = startiter + len;
-  auto indices_end = thrust::copy_if(thrust::cuda::par.on(thr_entry->stream),
-      startiter,
-      enditer,
-      in_data,
-      out_data,
-      IsNonZero<IdType>());
-  const int64_t num_nonzeros = indices_end - out_data;
+  IdArray ret = NewIdArray(len, ctx, 64);
+
+  cudaStream_t stream = 0;
+
+  const IdType * const in_data = static_cast<const IdType*>(array->data);
+  int64_t * const out_data = static_cast<int64_t*>(ret->data);
+
+  // room for cub to output on GPU
+  int64_t * d_num_nonzeros = static_cast<int64_t*>(
+      device->AllocWorkspace(ctx, sizeof(int64_t)));
+
+  size_t temp_size = 0;
+  cub::DeviceSelect::If(nullptr, temp_size, in_data, out_data,
+      d_num_nonzeros, len, IsNonZero<IdType>());
+  void * temp = device->AllocWorkspace(ctx, temp_size);
+  cub::DeviceSelect::If(temp, temp_size, in_data, out_data,
+      d_num_nonzeros, len, IsNonZero<IdType>(), stream);
+  device->FreeWorkspace(ctx, temp);
+
+  // copy number of selected elements from GPU to CPU
+  int64_t num_nonzeros;
+  device->CopyDataFromTo(
+      d_num_nonzeros, 0,
+      &num_nonzeros, 0,
+      sizeof(num_nonzeros),
+      ctx,
+      DGLContext{kDLCPU, 0},
+      DGLType{kDLInt, 64, 1},
+      stream);
+  device->FreeWorkspace(ctx, d_num_nonzeros);
+  device->StreamSync(ctx, stream);
+
+  // truncate array to size
   return ret.CreateView({num_nonzeros}, ret->dtype, 0);
 }
diff --git a/src/array/cuda/array_sort.cu b/src/array/cuda/array_sort.cu
index f547ba378dc3..04cd513338f9 100644
--- a/src/array/cuda/array_sort.cu
+++ b/src/array/cuda/array_sort.cu
@@ -4,9 +4,9 @@
  * \brief Array sort GPU implementation
  */
 #include <dgl/array.h>
-#include <cub/cub.cuh>
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "./dgl_cub.cuh"
 
 namespace dgl {
 using runtime::NDArray;
diff --git a/src/array/cuda/csr_sort.cu b/src/array/cuda/csr_sort.cu
index add484f0adab..316c3147c473 100644
--- a/src/array/cuda/csr_sort.cu
+++ b/src/array/cuda/csr_sort.cu
@@ -4,9 +4,9 @@
  * \brief Sort CSR index
  */
 #include <dgl/array.h>
-#include <cub/cub.cuh>
 #include "../../runtime/cuda/cuda_common.h"
 #include "./utils.h"
+#include "./dgl_cub.cuh"
 
 namespace dgl {
diff --git a/src/array/cuda/dgl_cub.cuh b/src/array/cuda/dgl_cub.cuh
new file mode 100644
index 000000000000..17dad7dc48f9
--- /dev/null
+++ b/src/array/cuda/dgl_cub.cuh
@@ -0,0 +1,17 @@
+/*!
+ *  Copyright (c) 2021 by Contributors
+ * \file dgl_cub.cuh
+ * \brief Wrapper to place cub in dgl namespace.
+ */
+
+#ifndef DGL_ARRAY_CUDA_DGL_CUB_CUH_
+#define DGL_ARRAY_CUDA_DGL_CUB_CUH_
+
+// include cub in a safe manner
+#define CUB_NS_PREFIX namespace dgl {
+#define CUB_NS_POSTFIX }
+#include "cub/cub.cuh"
+#undef CUB_NS_POSTFIX
+#undef CUB_NS_PREFIX
+
+#endif  // DGL_ARRAY_CUDA_DGL_CUB_CUH_
diff --git a/src/array/cuda/utils.cu b/src/array/cuda/utils.cu
index f9e993bcd7ff..5fe4f976829c 100644
--- a/src/array/cuda/utils.cu
+++ b/src/array/cuda/utils.cu
@@ -5,7 +5,7 @@
  */
 #include "./utils.h"
 
-#include <cub/cub.cuh>
+#include "./dgl_cub.cuh"
 #include "../../runtime/cuda/cuda_common.h"
 
 namespace dgl {
diff --git a/src/runtime/cuda/cuda_hashtable.cu b/src/runtime/cuda/cuda_hashtable.cu
index 896eb6f54de4..da4ae373b457 100644
--- a/src/runtime/cuda/cuda_hashtable.cu
+++ b/src/runtime/cuda/cuda_hashtable.cu
@@ -4,11 +4,11 @@
  * \brief Device-level functions for use within CUDA kernels.
 */
-#include <cub/cub.cuh>
 #include 
 
 #include "cuda_hashtable.cuh"
 #include "../../kernel/cuda/atomic.cuh"
+#include "../../array/cuda/dgl_cub.cuh"
 
 using namespace dgl::kernel::cuda;

From 35441f1f8af7a9a3b0cec72f4b43ab1ae4c15ab0 Mon Sep 17 00:00:00 2001
From: Dominique LaSalle
Date: Wed, 17 Mar 2021 11:43:08 -0700
Subject: [PATCH 2/2] Using counting iterator

---
 src/array/cuda/array_nonzero.cu | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/array/cuda/array_nonzero.cu b/src/array/cuda/array_nonzero.cu
index 1dadbf75b0ce..c7e7df151be0 100644
--- a/src/array/cuda/array_nonzero.cu
+++ b/src/array/cuda/array_nonzero.cu
@@ -15,10 +15,15 @@ namespace aten {
 namespace impl {
 
 template <typename IdType>
-struct IsNonZero {
-  __device__ bool operator() (const IdType val) {
-    return val != 0;
+struct IsNonZeroIndex {
+  explicit IsNonZeroIndex(const IdType * array) : array_(array) {
   }
+
+  __device__ bool operator() (const int64_t index) {
+    return array_[index] != 0;
+  }
+
+  const IdType * array_;
 };
 
 template <DLDeviceType XPU, typename IdType>
@@ -34,16 +39,19 @@ IdArray NonZero(IdArray array) {
   const IdType * const in_data = static_cast<const IdType*>(array->data);
   int64_t * const out_data = static_cast<int64_t*>(ret->data);
 
+  IsNonZeroIndex<IdType> comp(in_data);
+  cub::CountingInputIterator<int64_t> counter(0);
+
   // room for cub to output on GPU
   int64_t * d_num_nonzeros = static_cast<int64_t*>(
       device->AllocWorkspace(ctx, sizeof(int64_t)));
 
   size_t temp_size = 0;
-  cub::DeviceSelect::If(nullptr, temp_size, in_data, out_data,
-      d_num_nonzeros, len, IsNonZero<IdType>());
+  cub::DeviceSelect::If(nullptr, temp_size, counter, out_data,
+      d_num_nonzeros, len, comp, stream);
   void * temp = device->AllocWorkspace(ctx, temp_size);
-  cub::DeviceSelect::If(temp, temp_size, in_data, out_data,
-      d_num_nonzeros, len, IsNonZero<IdType>(), stream);
+  cub::DeviceSelect::If(temp, temp_size, counter, out_data,
+      d_num_nonzeros, len, comp, stream);
   device->FreeWorkspace(ctx, temp);
 
   // copy number of selected elements from GPU to CPU
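
For reviewers unfamiliar with the cub calling convention, here is a minimal, self-contained sketch of the pattern patch 2 uses: a two-phase cub::DeviceSelect::If (first call with a null workspace to query the temporary-storage size, second call to run the selection), driven by a CountingInputIterator and an index-based predicate. It is an illustration only, not the DGL code: it hard-codes int32_t input, includes <cub/cub.cuh> directly rather than going through dgl_cub.cuh, and uses plain cudaMalloc/cudaMemcpy on the default stream in place of DGL's DeviceAPI AllocWorkspace/CopyDataFromTo helpers.

// compile with: nvcc -o nonzero_demo nonzero_demo.cu
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include <cub/cub.cuh>

// Index-based predicate: selects positions i where array_[i] != 0.
struct IsNonZeroIndex {
  explicit IsNonZeroIndex(const int32_t* array) : array_(array) {}
  __device__ bool operator()(const int64_t index) const {
    return array_[index] != 0;
  }
  const int32_t* array_;
};

int main() {
  const int len = 8;
  const int32_t h_in[len] = {0, 3, 0, 0, 7, 1, 0, 2};

  int32_t* d_in = nullptr;
  int64_t* d_out = nullptr;  // selected indices
  int64_t* d_num = nullptr;  // number of selected indices
  cudaMalloc(&d_in, len * sizeof(*d_in));
  cudaMalloc(&d_out, len * sizeof(*d_out));
  cudaMalloc(&d_num, sizeof(*d_num));
  cudaMemcpy(d_in, h_in, len * sizeof(*d_in), cudaMemcpyHostToDevice);

  IsNonZeroIndex comp(d_in);
  cub::CountingInputIterator<int64_t> counter(0);

  // Pass 1: a null workspace pointer makes cub report the bytes it needs.
  size_t temp_size = 0;
  cub::DeviceSelect::If(nullptr, temp_size, counter, d_out, d_num, len, comp);

  // Pass 2: run the selection with the allocated workspace.
  void* temp = nullptr;
  cudaMalloc(&temp, temp_size);
  cub::DeviceSelect::If(temp, temp_size, counter, d_out, d_num, len, comp);

  // cudaMemcpy on the default stream synchronizes before reading the count.
  int64_t num = 0;
  cudaMemcpy(&num, d_num, sizeof(num), cudaMemcpyDeviceToHost);
  std::printf("selected %lld indices\n", static_cast<long long>(num));  // 4

  cudaFree(temp);
  cudaFree(d_num);
  cudaFree(d_out);
  cudaFree(d_in);
  return 0;
}

Selecting over a counting iterator rather than the values themselves is what lets the patch emit the positions of the non-zero elements (as in the old thrust::copy_if version) instead of the elements, while keeping the output type int64_t independent of IdType.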