From cf50b7d7d133ee8b975092706ae55b270c47f4d0 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 6 May 2022 17:00:03 +0400 Subject: [PATCH 1/3] Use CUB version of adjacent difference --- .../system/cuda/detail/adjacent_difference.h | 519 +++++------------- 1 file changed, 143 insertions(+), 376 deletions(-) diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h index fb0ce49f1..e8a1940af 100644 --- a/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/system/cuda/detail/adjacent_difference.h @@ -29,21 +29,20 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include - #include +#include #include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include THRUST_NAMESPACE_BEGIN @@ -61,375 +60,121 @@ namespace cuda_cub { namespace __adjacent_difference { - namespace mpl = thrust::detail::mpl::math; - - template - struct PtxPolicy - { - enum - { - BLOCK_THREADS = _BLOCK_THREADS, - ITEMS_PER_THREAD = _ITEMS_PER_THREAD, - ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD - }; - - static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; - static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; - static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; - }; - - template - struct items_per_thread - { - enum - { - value = (INPUT_SIZE <= 8) - ? NOMINAL_4B_ITEMS_PER_THREAD - : mpl::min< - int, - NOMINAL_4B_ITEMS_PER_THREAD, - mpl::max::value>::value - }; - }; - - template - struct Tuning; - - template - struct Tuning - { - enum - { - INPUT_SIZE = static_cast(sizeof(T)), - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = items_per_thread::value - }; - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_DEFAULT, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - template - struct Tuning : Tuning - { - enum - { - NOMINAL_4B_ITEMS_PER_THREAD = 7, - ITEMS_PER_THREAD = items_per_thread::value - }; - typedef PtxPolicy<128, - ITEMS_PER_THREAD, - cub::BLOCK_LOAD_WARP_TRANSPOSE, - cub::LOAD_LDG, - cub::BLOCK_STORE_WARP_TRANSPOSE> - type; - }; - - template - struct AdjacentDifferenceAgent + cudaError_t THRUST_RUNTIME_FUNCTION + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIt first, + OutputIt result, + BinaryOp binary_op, + std::size_t num_items, + cudaStream_t stream, + bool debug_sync) { - typedef typename iterator_traits::value_type input_type; - - // XXX output type must be result of BinaryOp(input_type,input_type); - typedef input_type output_type; - - template - struct PtxPlan : Tuning::type - { - typedef Tuning tuning; - - typedef typename core::LoadIterator::type LoadIt; - typedef typename core::BlockLoad::type BlockLoad; - - typedef typename core::BlockStore::type - BlockStore; - - typedef cub::BlockAdjacentDifference - BlockAdjacentDifference; - - union TempStorage - { - typename BlockAdjacentDifference::TempStorage discontinuity; - typename BlockLoad::TempStorage load; - typename BlockStore::TempStorage store; - }; // union TempStorage - }; // struct PtxPlan - - typedef typename core::specialize_plan_msvc10_war::type::type ptx_plan; - - typedef typename ptx_plan::LoadIt LoadIt; - typedef typename ptx_plan::BlockLoad BlockLoad; - typedef typename ptx_plan::BlockStore BlockStore; - typedef typename ptx_plan::BlockAdjacentDifference BlockAdjacentDifference; - typedef typename ptx_plan::TempStorage TempStorage; - - - enum + if (num_items == 0) { - ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD, - BLOCK_THREADS = ptx_plan::BLOCK_THREADS, - ITEMS_PER_TILE = ptx_plan::ITEMS_PER_TILE, - }; + return cudaSuccess; + } - struct impl - { + constexpr bool in_place = InPlace; + constexpr bool read_left = true; + + using Dispatch32 = cub::DispatchAdjacentDifference; + using Dispatch64 = cub::DispatchAdjacentDifference; - //--------------------------------------------------------------------- - // Per-thread fields - //--------------------------------------------------------------------- - - TempStorage &temp_storage; - LoadIt load_it; // iterator to the first element - input_type * first_tile_previous; // iterator to the first element of previous tile value - OutputIt output_it; - BinaryOp binary_op; - - template - void THRUST_DEVICE_FUNCTION - consume_tile_impl(int num_remaining, - int tile_idx, - Size tile_base) - { - input_type input[ITEMS_PER_THREAD]; - output_type output[ITEMS_PER_THREAD]; - - if (IS_LAST_TILE) - { - // Fill last elements with the first element - // because collectives are not suffix guarded - BlockLoad(temp_storage.load) - .Load(load_it + tile_base, - input, - num_remaining, - *(load_it + tile_base)); - } - else - { - BlockLoad(temp_storage.load).Load(load_it + tile_base, input); - } - - - core::sync_threadblock(); - - if (IS_FIRST_TILE) - { - BlockAdjacentDifference(temp_storage.discontinuity) - .SubtractLeft(input, output, binary_op); - if (threadIdx.x == 0) - output[0] = input[0]; - } - else - { - input_type tile_prev_input = first_tile_previous[tile_idx]; - BlockAdjacentDifference(temp_storage.discontinuity) - .SubtractLeft(input, output, binary_op, tile_prev_input); - } - - core::sync_threadblock(); - - if (IS_LAST_TILE) - { - BlockStore(temp_storage.store) - .Store(output_it + tile_base, output, num_remaining); - } - else - { - BlockStore(temp_storage.store).Store(output_it + tile_base, output); - } - } - - - template - void THRUST_DEVICE_FUNCTION - consume_tile(int num_remaining, - int tile_idx, - Size tile_base) - { - if (tile_idx == 0) - { - consume_tile_impl(num_remaining, - tile_idx, - tile_base); - } - else - { - consume_tile_impl(num_remaining, - tile_idx, - tile_base); - } - } - - void THRUST_DEVICE_FUNCTION - consume_range(Size num_items) - { - int tile_idx = blockIdx.x; - Size tile_base = static_cast(tile_idx) * ITEMS_PER_TILE; - Size num_remaining = num_items - tile_base; - - if (num_remaining > ITEMS_PER_TILE) // not a last tile - { - consume_tile(num_remaining, tile_idx, tile_base); - } - else if (num_remaining > 0) - { - consume_tile(num_remaining, tile_idx, tile_base); - } - } - - //--------------------------------------------------------------------- - // Constructor - //--------------------------------------------------------------------- - - THRUST_DEVICE_FUNCTION - impl(TempStorage &temp_storage_, - InputIt input_it_, - input_type * first_tile_previous_, - OutputIt result_, - BinaryOp binary_op_, - Size num_items) - : temp_storage(temp_storage_), - load_it(core::make_load_iterator(ptx_plan(), input_it_)), - first_tile_previous(first_tile_previous_), - output_it(result_), - binary_op(binary_op_) - { - consume_range(num_items); - } - }; // struct impl - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(InputIt first, - input_type *first_element, - OutputIt result, - BinaryOp binary_op, - Size num_items, - char * shmem) - { - TempStorage &storage = *reinterpret_cast(shmem); - impl(storage, first, first_element, result, binary_op, num_items); - } - }; // struct AdjacentDifferenceAgent + cudaError_t status; + THRUST_INDEX_TYPE_DISPATCH2(status, + Dispatch32::Dispatch, + Dispatch64::Dispatch, + num_items, + (d_temp_storage, + temp_storage_bytes, + first, + result, + num_items_fixed, + binary_op, + stream, + debug_sync)); + return status; + } template - struct InitAgent + class BinaryOp> + cudaError_t THRUST_RUNTIME_FUNCTION + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIt first, + OutputIt result, + BinaryOp binary_op, + std::size_t num_items, + cudaStream_t stream, + bool debug_sync, + thrust::detail::integral_constant /* comparable */) { - template - struct PtxPlan : PtxPolicy<128> {}; - typedef core::specialize_plan ptx_plan; - - //--------------------------------------------------------------------- - // Agent entry point - //--------------------------------------------------------------------- - - THRUST_AGENT_ENTRY(InputIt first, - OutputIt result, - Size num_tiles, - int items_per_tile, - char * /*shmem*/) - { - int tile_idx = blockIdx.x * blockDim.x + threadIdx.x; - Size tile_base = static_cast(tile_idx) * items_per_tile; - if (tile_base > 0 && tile_idx < num_tiles) - result[tile_idx] = first[tile_base - 1]; - } - }; // struct InitAgent + constexpr bool in_place = true; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); + } template + class BinaryOp> cudaError_t THRUST_RUNTIME_FUNCTION - doit_step(void * d_temp_storage, - size_t & temp_storage_bytes, - InputIt first, - OutputIt result, - BinaryOp binary_op, - Size num_items, + doit_step(void *d_temp_storage, + size_t &temp_storage_bytes, + InputIt first, + OutputIt result, + BinaryOp binary_op, + std::size_t num_items, cudaStream_t stream, - bool debug_sync) + bool debug_sync, + thrust::detail::integral_constant /* comparable */) { - if (num_items == 0) - return cudaSuccess; - - using core::AgentPlan; - using core::AgentLauncher; - - cudaError_t status = cudaSuccess; - - typedef AgentLauncher< - AdjacentDifferenceAgent > - difference_agent; - - typedef typename iterator_traits::value_type input_type; - typedef AgentLauncher > init_agent; - - AgentPlan difference_plan = difference_agent::get_plan(stream); - AgentPlan init_plan = init_agent::get_plan(); - - - Size tile_size = difference_plan.items_per_tile; - Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size); - - size_t tmp1 = num_tiles * sizeof(input_type); - size_t vshmem_size = core::vshmem_size(difference_plan.shared_memory_size, - num_tiles); - - size_t allocation_sizes[2] = {tmp1, vshmem_size}; - void * allocations[2] = {NULL, NULL}; - - status = core::alias_storage(d_temp_storage, - temp_storage_bytes, - allocations, - allocation_sizes); - CUDA_CUB_RET_IF_FAIL(status); - - if (d_temp_storage == NULL) + // The documentation states that pointers might be equal but can't alias in + // any other way. That is, the distance should be equal to zero or exceed + // `num_items`. In the latter case, we use an optimized version. + if (first != result) { - return status; + constexpr bool in_place = false; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); } - input_type *first_tile_previous = (input_type *)allocations[0]; - char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL; - - init_agent ia(init_plan, num_tiles, stream, "adjacent_difference::init_agent", debug_sync); - ia.launch(first, first_tile_previous, num_tiles, tile_size); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - - difference_agent da(difference_plan, num_items, stream, vshmem_ptr, "adjacent_difference::difference_agent", debug_sync); - da.launch(first, - first_tile_previous, - result, - binary_op, - num_items); - CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError()); - return status; + constexpr bool in_place = true; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); } template ::difference_type size_type; - - size_type num_items = thrust::distance(first, last); - size_t storage_size = 0; - cudaStream_t stream = cuda_cub::stream(policy); - bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - - cudaError_t status; - THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items, - (NULL, storage_size, first, result, binary_op, - num_items_fixed, stream, debug_sync)); + const auto num_items = + static_cast(thrust::distance(first, last)); + std::size_t storage_size = 0; + cudaStream_t stream = cuda_cub::stream(policy); + const bool debug_sync = THRUST_DEBUG_SYNC_FLAG; + + using UnwrapInputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t; + using UnwrapOutputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t; + + constexpr bool can_compare_iterators = + is_contiguous_iterator::value && + is_contiguous_iterator::value && + thrust::detail::is_same::value; + + auto first_unwrap = thrust::detail::try_unwrap_contiguous_iterator(first); + auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result); + + thrust::detail::integral_constant comparable; + + cudaError_t status = doit_step(nullptr, + storage_size, + first_unwrap, + result_unwrap, + binary_op, + num_items, + stream, + debug_sync, + comparable); cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step"); // Allocate temporary storage. thrust::detail::temporary_array tmp(policy, storage_size); - void *ptr = static_cast(tmp.data().get()); - THRUST_INDEX_TYPE_DISPATCH(status, doit_step, num_items, - (ptr, storage_size, first, result, binary_op, - num_items_fixed, stream, debug_sync)); + status = doit_step(static_cast(tmp.data().get()), + storage_size, + first_unwrap, + result_unwrap, + binary_op, + num_items, + stream, + debug_sync, + comparable); cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step"); status = cuda_cub::synchronize_optional(policy); From 0b49a6adfa952624e4652345006809cbaa18be4a Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sun, 8 May 2022 17:42:46 +0400 Subject: [PATCH 2/3] Compare value types instead of iterator once --- testing/cuda/adjacent_difference.cu | 6 ++++++ thrust/system/cuda/detail/adjacent_difference.h | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/testing/cuda/adjacent_difference.cu b/testing/cuda/adjacent_difference.cu index 96f3a5234..6f2927ebc 100644 --- a/testing/cuda/adjacent_difference.cu +++ b/testing/cuda/adjacent_difference.cu @@ -98,6 +98,12 @@ DECLARE_UNITTEST(TestAdjacentDifferenceCudaStreams); struct detect_wrong_difference { + using difference_type = void; + using value_type = void; + using pointer = void; + using reference = void; + using iterator_category = std::output_iterator_tag; + bool * flag; __host__ __device__ detect_wrong_difference operator++() const { return *this; } diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h index e8a1940af..6539584ad 100644 --- a/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/system/cuda/detail/adjacent_difference.h @@ -197,10 +197,13 @@ namespace __adjacent_difference { using UnwrapInputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t; using UnwrapOutputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t; + using InputValueT = thrust::iterator_value_t; + using OutputValueT = thrust::iterator_value_t; + constexpr bool can_compare_iterators = - is_contiguous_iterator::value && - is_contiguous_iterator::value && - thrust::detail::is_same::value; + std::is_pointer::value && + std::is_pointer::value && + std::is_same::value; auto first_unwrap = thrust::detail::try_unwrap_contiguous_iterator(first); auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result); From 37c05e75d45d912da7aae33a3747c1cad14da11f Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Sun, 8 May 2022 17:56:02 +0400 Subject: [PATCH 3/3] Better name for in-place execution --- .../system/cuda/detail/adjacent_difference.h | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h index 6539584ad..38f19fa66 100644 --- a/thrust/system/cuda/detail/adjacent_difference.h +++ b/thrust/system/cuda/detail/adjacent_difference.h @@ -60,7 +60,7 @@ namespace cuda_cub { namespace __adjacent_difference { - template @@ -79,20 +79,20 @@ namespace __adjacent_difference { return cudaSuccess; } - constexpr bool in_place = InPlace; + constexpr bool may_alias = MayAlias; constexpr bool read_left = true; using Dispatch32 = cub::DispatchAdjacentDifference; using Dispatch64 = cub::DispatchAdjacentDifference; cudaError_t status; @@ -125,15 +125,15 @@ namespace __adjacent_difference { bool debug_sync, thrust::detail::integral_constant /* comparable */) { - constexpr bool in_place = true; - return doit_step(d_temp_storage, - temp_storage_bytes, - first, - result, - binary_op, - num_items, - stream, - debug_sync); + constexpr bool may_alias = true; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); } template (d_temp_storage, - temp_storage_bytes, - first, - result, - binary_op, - num_items, - stream, - debug_sync); + constexpr bool may_alias = false; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); } - constexpr bool in_place = true; - return doit_step(d_temp_storage, - temp_storage_bytes, - first, - result, - binary_op, - num_items, - stream, - debug_sync); + constexpr bool may_alias = true; + return doit_step(d_temp_storage, + temp_storage_bytes, + first, + result, + binary_op, + num_items, + stream, + debug_sync); } template