Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Merge pull request #1661 from allisonvacanti/if_target_cdp
Browse files Browse the repository at this point in the history
Update CDP support macros for if-target compatibility
  • Loading branch information
alliepiper authored Jun 29, 2022
2 parents f302e6a + cb30a6b commit 967924e
Show file tree
Hide file tree
Showing 25 changed files with 952 additions and 1,917 deletions.
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 38 files
+100 −0 cub/detail/detect_cuda_runtime.cuh
+1 −7 cub/detail/device_synchronize.cuh
+2 −2 cub/detail/type_traits.cuh
+7 −10 cub/device/dispatch/dispatch_adjacent_difference.cuh
+12 −15 cub/device/dispatch/dispatch_histogram.cuh
+9 −15 cub/device/dispatch/dispatch_merge_sort.cuh
+58 −57 cub/device/dispatch/dispatch_radix_sort.cuh
+36 −35 cub/device/dispatch/dispatch_reduce.cuh
+20 −29 cub/device/dispatch/dispatch_reduce_by_key.cuh
+20 −15 cub/device/dispatch/dispatch_rle.cuh
+18 −17 cub/device/dispatch/dispatch_scan.cuh
+18 −15 cub/device/dispatch/dispatch_scan_by_key.cuh
+56 −58 cub/device/dispatch/dispatch_segmented_sort.cuh
+18 −28 cub/device/dispatch/dispatch_select_if.cuh
+24 −13 cub/device/dispatch/dispatch_spmv_orig.cuh
+6 −10 cub/device/dispatch/dispatch_three_way_partition.cuh
+15 −16 cub/device/dispatch/dispatch_unique_by_key.cuh
+6 −13 cub/util_arch.cuh
+3 −5 cub/util_debug.cuh
+39 −44 cub/util_device.cuh
+1 −1 examples/device/example_device_reduce.cu
+50 −12 test/CMakeLists.txt
+21 −9 test/README.md
+33 −0 test/cmake/check_source_files.cmake
+0 −2 test/test_allocator.cu
+34 −0 test/test_cdp_variant_state.cu
+0 −71 test/test_device_histogram.cu
+326 −150 test/test_device_radix_sort.cu
+282 −172 test/test_device_reduce.cu
+125 −83 test/test_device_reduce_by_key.cu
+127 −87 test/test_device_run_length_encode.cu
+77 −65 test/test_device_scan.cu
+124 −106 test/test_device_scan_by_key.cu
+10 −6 test/test_device_segmented_sort.cu
+122 −78 test/test_device_select_if.cu
+96 −68 test/test_device_select_unique.cu
+110 −73 test/test_device_select_unique_by_key.cu
+17 −12 test/test_util.h
33 changes: 33 additions & 0 deletions testing/cmake/check_source_files.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,24 @@ if (NOT valid_count EQUAL 5)
"Matched ${valid_count} times, expected 5.")
endif()

################################################################################
# Legacy macro checks.
# Check all files in Thrust to make sure that they aren't using the legacy
# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
#
# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
# They are provided for legacy purposes and should be replaced with
# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
#
#
set(legacy_macro_header_exclusions
# This header defines a legacy CUDART macro:
thrust/system/cuda/config.h
)

set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")

################################################################################
# Read source files:
foreach(src ${thrust_srcs})
Expand Down Expand Up @@ -145,6 +163,21 @@ foreach(src ${thrust_srcs})
set(found_errors 1)
endif()
endif()

if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)

if (NOT thrust_count EQUAL 0)
message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
set(found_errors 1)
endif()

if (NOT cub_count EQUAL 0)
message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
set(found_errors 1)
endif()
endif()
endforeach()

if (NOT found_errors EQUAL 0)
Expand Down
16 changes: 10 additions & 6 deletions testing/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ file(GLOB test_srcs

# These tests always build with RDC, so make sure that the sm_XX flags are
# compatible. See note in ThrustCudaConfig.cmake.
# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback
# tests to build for non-rdc arches. But for now, all files in a given directory
# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
# how CUDA_FLAGS works.
set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")

foreach(thrust_target IN LISTS THRUST_TARGETS)
Expand All @@ -18,11 +22,11 @@ foreach(thrust_target IN LISTS THRUST_TARGETS)
get_filename_component(test_name "${test_src}" NAME_WLE)
string(PREPEND test_name "cuda.")

thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})

# All in testing/cuda will test device-side launch (aka calling parallel
# algorithms from device code), which requires the CUDA device-side runtime,
# which requires RDC, so these always need to be built with RDC.
thrust_enable_rdc_for_cuda_target(${test_target})
# Create two targets, one with RDC enabled, the other without. This tests
# both device-side behaviors -- the CDP kernel launch with RDC, and the
# serial fallback path without RDC.
thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target})
thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target})
thrust_enable_rdc_for_cuda_target(${cdp_test_target})
endforeach()
endforeach()
45 changes: 35 additions & 10 deletions thrust/system/cuda/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,22 +32,47 @@
// older releases. This header will always pull in version info:
#include <cub/util_namespace.cuh>

#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
# if !defined(__CUDA_ARCH__) || defined(__CUDACC_RDC__)
# define __THRUST_HAS_CUDART__ 1
# define THRUST_RUNTIME_FUNCTION __host__ __device__ __forceinline__
# else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
# endif
#include <cub/detail/detect_cuda_runtime.cuh>

/**
* \def THRUST_RUNTIME_FUNCTION
*
* Execution space for functions that can use the CUDA runtime API (`__host__`
* when RDC is off, `__host__ __device__` when RDC is on).
*/
#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION

/**
* \def THRUST_RDC_ENABLED
*
* Defined if RDC is enabled.
*/
#ifdef CUB_RDC_ENABLED
#define THRUST_RDC_ENABLED
#endif

/**
* \def __THRUST_HAS_CUDART__
*
* Whether or not the active compiler pass is allowed to invoke device kernels
* or methods from the CUDA runtime API.
*
* This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__`
* and is not compatible with `NV_IF_TARGET`. It is provided for legacy
* purposes only.
*
* Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`.
*/
#ifdef CUB_RUNTIME_ENABLED
#define __THRUST_HAS_CUDART__ 1
#else
# define __THRUST_HAS_CUDART__ 0
# define THRUST_RUNTIME_FUNCTION __host__ __forceinline__
#define __THRUST_HAS_CUDART__ 0
#endif

// These definitions were intended for internal use only and are now obsolete.
// If you relied on them, consider porting your code to use the functionality
// in libcu++'s <nv/target> header.
//
// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
// them available again. These should be considered deprecated and will be
// fully removed in a future version.
Expand Down
35 changes: 14 additions & 21 deletions thrust/system/cuda/detail/adjacent_difference.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@
#include <thrust/detail/config.h>

#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC

#include <thrust/detail/cstdint.h>
#include <thrust/detail/minmax.h>
#include <thrust/detail/temporary_array.h>
#include <thrust/detail/type_traits.h>
#include <thrust/functional.h>
#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/dispatch.h>
#include <thrust/system/cuda/detail/par_to_seq.h>
#include <thrust/system/cuda/detail/util.h>
Expand Down Expand Up @@ -260,27 +262,18 @@ adjacent_difference(execution_policy<Derived> &policy,
OutputIt result,
BinaryOp binary_op)
{
OutputIt ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __adjacent_difference::adjacent_difference(policy,
first,
last,
result,
binary_op);),
(result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
first,
last,
result,
binary_op);));
return result;
}

template <class Derived,
Expand Down
72 changes: 72 additions & 0 deletions thrust/system/cuda/detail/cdp_dispatch.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* \file
* Utilities for CUDA dynamic parallelism.
*/

#pragma once

#include <cub/config.cuh>
#include <cub/detail/detect_cuda_runtime.cuh>

#include <nv/target>

/**
* \def THRUST_CDP_DISPATCH
*
* If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
* run the parallel implementation. Otherwise, run the parallel implementation
* when called from the host, and fallback to the sequential implementation on
* the device.
*
* `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
* parentheses, similar to NV_IF_TARGET blocks:
*
* \code
* THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
* \endcode
*/

#ifdef THRUST_RDC_ENABLED

// seq_impl unused.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_ANY_TARGET, par_impl)

#else // THRUST_RDC_ENABLED

// Special case for NVCC -- need to inform the device path about the kernels
// that are launched from the host path.
#if defined(__CUDACC__) && defined(__CUDA_ARCH__)

// Device-side launch not supported, fallback to sequential in device code.
#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
if (false) \
{ /* Without this, the device pass won't compile any kernels. */ \
NV_IF_TARGET(NV_ANY_TARGET, par_impl); \
} \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#else // NVCC device pass

#define THRUST_CDP_DISPATCH(par_impl, seq_impl) \
NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)

#endif // NVCC device pass

#endif // THRUST_RDC_ENABLED
45 changes: 16 additions & 29 deletions thrust/system/cuda/detail/copy.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@

#include <thrust/detail/config.h>

#include <thrust/advance.h>

#include <thrust/system/cuda/config.h>
#include <thrust/system/cuda/detail/cdp_dispatch.h>
#include <thrust/system/cuda/detail/execution_policy.h>
#include <thrust/system/cuda/detail/cross_system.h>

Expand Down Expand Up @@ -117,22 +120,11 @@ copy(execution_policy<System> &system,
InputIterator last,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, last, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy(cvt_to_seq(derived_cast(system)),
first,
last,
result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system, first, last, result);),
(result =
thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
return result;
} // end copy()

__thrust_exec_check_disable__
Expand All @@ -146,19 +138,14 @@ copy_n(execution_policy<System> &system,
Size n,
OutputIterator result)
{
OutputIterator ret = result;
if (__THRUST_HAS_CUDART__)
{
ret = __copy::device_to_device(system, first, first + n, result);
}
else
{
#if !__THRUST_HAS_CUDART__
ret = thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);
#endif
}

return ret;
THRUST_CDP_DISPATCH(
(result = __copy::device_to_device(system,
first,
thrust::next(first, n),
result);),
(result =
thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
return result;
} // end copy_n()
#endif

Expand Down
Loading

0 comments on commit 967924e

Please sign in to comment.