Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Update CDP support macros for if-target compatibility #1661

Merged
merged 3 commits into from
Jun 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 38 files
+100 −0 cub/detail/detect_cuda_runtime.cuh
+1 −7 cub/detail/device_synchronize.cuh
+2 −2 cub/detail/type_traits.cuh
+7 −10 cub/device/dispatch/dispatch_adjacent_difference.cuh
+12 −15 cub/device/dispatch/dispatch_histogram.cuh
+9 −15 cub/device/dispatch/dispatch_merge_sort.cuh
+58 −57 cub/device/dispatch/dispatch_radix_sort.cuh
+36 −35 cub/device/dispatch/dispatch_reduce.cuh
+20 −29 cub/device/dispatch/dispatch_reduce_by_key.cuh
+20 −15 cub/device/dispatch/dispatch_rle.cuh
+18 −17 cub/device/dispatch/dispatch_scan.cuh
+18 −15 cub/device/dispatch/dispatch_scan_by_key.cuh
+56 −58 cub/device/dispatch/dispatch_segmented_sort.cuh
+18 −28 cub/device/dispatch/dispatch_select_if.cuh
+24 −13 cub/device/dispatch/dispatch_spmv_orig.cuh
+6 −10 cub/device/dispatch/dispatch_three_way_partition.cuh
+15 −16 cub/device/dispatch/dispatch_unique_by_key.cuh
+6 −13 cub/util_arch.cuh
+3 −5 cub/util_debug.cuh
+39 −44 cub/util_device.cuh
+1 −1 examples/device/example_device_reduce.cu
+50 −12 test/CMakeLists.txt
+21 −9 test/README.md
+33 −0 test/cmake/check_source_files.cmake
+0 −2 test/test_allocator.cu
+34 −0 test/test_cdp_variant_state.cu
+0 −71 test/test_device_histogram.cu
+326 −150 test/test_device_radix_sort.cu
+282 −172 test/test_device_reduce.cu
+125 −83 test/test_device_reduce_by_key.cu
+127 −87 test/test_device_run_length_encode.cu
+77 −65 test/test_device_scan.cu
+124 −106 test/test_device_scan_by_key.cu
+10 −6 test/test_device_segmented_sort.cu
+122 −78 test/test_device_select_if.cu
+96 −68 test/test_device_select_unique.cu
+110 −73 test/test_device_select_unique_by_key.cu
+17 −12 test/test_util.h
33 changes: 33 additions & 0 deletions testing/cmake/check_source_files.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,24 @@ if (NOT valid_count EQUAL 5)
"Matched ${valid_count} times, expected 5.")
endif()

################################################################################
# Legacy macro checks.
# Check all files in Thrust to make sure that they aren't using the legacy
# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
#
# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
# They are provided for legacy purposes and should be replaced with
# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
#
#
set(legacy_macro_header_exclusions
# This header defines a legacy CUDART macro:
thrust/system/cuda/config.h
)

set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")

################################################################################
# Read source files:
foreach(src ${thrust_srcs})
Expand Down Expand Up @@ -145,6 +163,21 @@ foreach(src ${thrust_srcs})
set(found_errors 1)
endif()
endif()

if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)

if (NOT thrust_count EQUAL 0)
message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
set(found_errors 1)
endif()

if (NOT cub_count EQUAL 0)
message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
set(found_errors 1)
endif()
endif()
endforeach()

if (NOT found_errors EQUAL 0)
Expand Down
16 changes: 10 additions & 6 deletions testing/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ file(GLOB test_srcs

# These tests always build with RDC, so make sure that the sm_XX flags are
# compatible. See note in ThrustCudaConfig.cmake.
# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback
# tests to build for non-rdc arches. But for now, all files in a given directory
# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
# how CUDA_FLAGS works.
set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")

foreach(thrust_target IN LISTS THRUST_TARGETS)
Expand All @@ -18,11 +22,11 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
get_filename_component(test_name "${test_src}" NAME_WLE)
string(PREPEND test_name "cuda.")

thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})

# All in testing/cuda will test device-side launch (aka calling parallel
# algorithms from device code), which requires the CUDA device-side runtime,
# which requires RDC, so these always need to be built with RDC.
thrust_enable_rdc_for_cuda_target(${test_target})
# Create two targets, one with RDC enabled, the other without. This tests
# both device-side behaviors -- the CDP kernel launch with RDC, and the
# serial fallback path without RDC.
thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target})
thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target})
thrust_enable_rdc_for_cuda_target(${cdp_test_target})
endforeach()
endforeach()
45 changes: 35 additions & 10 deletions thrust/system/cuda/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,47 @@
// older releases. This header will always pull in version info:
#include <cub/util_namespace.cuh>

#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
# define __THRUST_HAS_CUDART__ 1
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
# else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
# endif
#include <cub/detail/detect_cuda_runtime.cuh>

/**
* \def THRUST_RUNTIME_FUNCTION
*
* Execution space for functions that can use the CUDA runtime API (`__host__`
* when RDC is off, `__host__ __device__` when RDC is on).
*/
#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION

/**
* \def THRUST_RDC_ENABLED
*
* Defined if RDC is enabled.
*/
#ifdef CUB_RDC_ENABLED
#define THRUST_RDC_ENABLED
#endif

/**
* \def __THRUST_HAS_CUDART__
*
* Whether or not the active compiler pass is allowed to invoke device kernels
* or methods from the CUDA runtime API.
*
* This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__`
* and is not compatible with `NV_IF_TARGET`. It is provided for legacy
* purposes only.
*
* Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`.
*/
#ifdef CUB_RUNTIME_ENABLED
#define __THRUST_HAS_CUDART__ 1
#else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
#define __THRUST_HAS_CUDART__ 0
#endif

// These definitions were intended for internal use only and are now obsolete.
// If you relied on them, consider porting your code to use the functionality
// in libcu++'s <nv/target> header.
//
// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
// them available again. These should be considered deprecated and will be
// fully removed in a future version.
Expand Down
35 changes: 14 additions & 21 deletions thrust/system/cuda/detail/adjacent_difference.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@
#include <thrust/detail/config.h>

#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC

#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/type_traits.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/dispatch.h>
#include <thrust/system/cuda/detail/par_to_seq.h>
#include <thrust/system/cuda/detail/util.h>
Expand Down Expand Up @@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
OutputIt result,
BinaryOp binary_op)
{
OutputIt ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);),
(result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);));
return result;
}

template <class Derived,
Expand Down
72 changes: 72 additions & 0 deletions thrust/system/cuda/detail/cdp_dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* \file
* Utilities for CUDA dynamic parallelism.
*/

#pragma once

#include <cub/config.cuh>
#include <cub/detail/detect_cuda_runtime.cuh>

#include <nv/target>

/**
* \def THRUST_CDP_DISPATCH
*
* If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
* run the parallel implementation. Otherwise, run the parallel implementation
* when called from the host, and fallback to the sequential implementation on
* the device.
*
* `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
* parentheses, similar to NV_IF_TARGET blocks:
*
* \code
* THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
* \endcode
*/

#ifdef THRUST_RDC_ENABLED

// seq_impl unused.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_ANY_TARGET, par_impl)

#else // THRUST_RDC_ENABLED

// Special case for NVCC -- need to inform the device path about the kernels
// that are launched from the host path.
#if defined(__CUDACC__) && defined(__CUDA_ARCH__)

// Device-side launch not supported, fallback to sequential in device code.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
if (false) \
{ /* Without this, the device pass won't compile any kernels. */ \
NV_IF_TARGET(NV_ANY_TARGET, par_impl); \
} \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#else // NVCC device pass

#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#endif // NVCC device pass

#endif // THRUST_RDC_ENABLED
45 changes: 16 additions & 29 deletions thrust/system/cuda/detail/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@

#include <thrust/detail/config.h>

#include <thrust/advance.h>

#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/cross_system.h>

Expand Down Expand Up @@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
InputIterator last,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, last, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy(cvt_to_seq(derived_cast(system)),
first,
last,
result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system, first, last, result);),
(result =
thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
return result;
} // end copy()

__thrust_exec_check_disable__
Expand All @@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
Size n,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, first + n, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system,
first,
thrust::next(first, n),
result);),
(result =
thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
return result;
} // end copy_n()
#endif

Expand Down
Loading