Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Add libcu++ dependency; initial round of NV_IF_TARGET ports. #1605

Merged
merged 10 commits into from
May 17, 2022
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "cub"]
path = dependencies/cub
url = ../cub.git
[submodule "libcudacxx"]
path = dependencies/libcudacxx
url = ../libcudacxx.git
50 changes: 29 additions & 21 deletions cmake/ThrustInstallRules.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -24,27 +24,35 @@ configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in"
install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
DESTINATION "${install_location}")

# Depending on how Thrust is configured, CUB's CMake scripts may or may not be
# included, so maintain a set of CUB install rules in both projects. By default
# CUB headers are installed alongside Thrust -- this may be disabled by turning
# off THRUST_INSTALL_CUB_HEADERS.
option(THRUST_INSTALL_CUB_HEADERS "Include cub headers when installing." ON)
# Depending on how Thrust is configured, libcudacxx and CUB's CMake scripts may
# or may not be include()'d, so force include their install rules when requested.
# By default, these projects are installed alongside Thrust. This is controlled by
# THRUST_INSTALL_CUB_HEADERS and THRUST_INSTALL_LIBCUDACXX_HEADERS.
option(THRUST_INSTALL_CUB_HEADERS "Include CUB headers when installing." ON)
if (THRUST_INSTALL_CUB_HEADERS)
install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub"
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
FILES_MATCHING
PATTERN "*.cuh"
)
# Use a function to limit scope of the CUB_*_DIR vars:
function(_thrust_install_cub_headers)
# Fake these for the logic in CUBInstallRules.cmake:
set(CUB_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/cub/")
set(CUB_BINARY_DIR "${Thrust_BINARY_DIR}/cub-config/")
set(CUB_ENABLE_INSTALL_RULES ON)
set(CUB_IN_THRUST OFF)
include("${Thrust_SOURCE_DIR}/dependencies/cub/cmake/CubInstallRules.cmake")
endfunction()

# Need to configure a file to store THRUST_INSTALL_HEADER_INFIX
install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/cub"
PATTERN cub-header-search EXCLUDE
)
set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/cub")
configure_file("${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake.in"
"${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake"
@ONLY)
install(FILES "${Thrust_BINARY_DIR}/dependencies/cub/cub/cmake/cub-header-search.cmake"
DESTINATION "${install_location}")
_thrust_install_cub_headers()
endif()

option(THRUST_INSTALL_LIBCUDACXX_HEADERS "Include libcudacxx headers when installing." ON)
if (THRUST_INSTALL_LIBCUDACXX_HEADERS)
# Use a function to limit scope of the libcudacxx_*_DIR vars:
function(_thrust_install_libcudacxx_headers)
# Fake these for the logic in libcudacxxInstallRules.cmake:
set(libcudacxx_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/libcudacxx/")
set(libcudacxx_BINARY_DIR "${Thrust_BINARY_DIR}/libcudacxx-config/")
set(libcudacxx_ENABLE_INSTALL_RULES ON)
include("${Thrust_SOURCE_DIR}/dependencies/libcudacxx/cmake/libcudacxxInstallRules.cmake")
endfunction()

_thrust_install_libcudacxx_headers()
endif()
2 changes: 1 addition & 1 deletion dependencies/cub
Submodule cub updated 63 files
+1 −6 cub/agent/agent_histogram.cuh
+1 −1 cub/agent/agent_rle.cuh
+1 −2 cub/agent/agent_segment_fixup.cuh
+1 −1 cub/agent/agent_spmv_orig.cuh
+24 −9 cub/agent/agent_sub_warp_merge_sort.cuh
+2 −2 cub/agent/single_pass_scan_operators.cuh
+3 −4 cub/block/block_adjacent_difference.cuh
+2 −2 cub/block/block_discontinuity.cuh
+4 −5 cub/block/block_exchange.cuh
+4 −16 cub/block/block_histogram.cuh
+7 −7 cub/block/block_load.cuh
+8 −10 cub/block/block_radix_rank.cuh
+7 −9 cub/block/block_radix_sort.cuh
+5 −5 cub/block/block_raking_layout.cuh
+5 −5 cub/block/block_reduce.cuh
+5 −5 cub/block/block_scan.cuh
+3 −3 cub/block/block_shuffle.cuh
+7 −7 cub/block/block_store.cuh
+4 −6 cub/block/specializations/block_histogram_sort.cuh
+3 −3 cub/block/specializations/block_reduce_raking.cuh
+5 −5 cub/block/specializations/block_reduce_raking_commutative_only.cuh
+3 −3 cub/block/specializations/block_reduce_warp_reductions.cuh
+3 −3 cub/block/specializations/block_scan_raking.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans2.cuh
+4 −4 cub/block/specializations/block_scan_warp_scans3.cuh
+59 −8 cub/cmake/cub-config.cmake
+15 −16 cub/detail/device_synchronize.cuh
+36 −35 cub/device/dispatch/dispatch_histogram.cuh
+35 −18 cub/device/dispatch/dispatch_radix_sort.cuh
+1 −1 cub/device/dispatch/dispatch_reduce.cuh
+22 −29 cub/device/dispatch/dispatch_reduce_by_key.cuh
+23 −28 cub/device/dispatch/dispatch_rle.cuh
+88 −84 cub/device/dispatch/dispatch_segmented_sort.cuh
+24 −28 cub/device/dispatch/dispatch_select_if.cuh
+42 −47 cub/device/dispatch/dispatch_spmv_orig.cuh
+14 −19 cub/device/dispatch/dispatch_three_way_partition.cuh
+48 −50 cub/grid/grid_queue.cuh
+31 −37 cub/iterator/tex_obj_input_iterator.cuh
+5 −18 cub/thread/thread_load.cuh
+4 −12 cub/thread/thread_store.cuh
+41 −49 cub/util_arch.cuh
+96 −67 cub/util_debug.cuh
+109 −138 cub/util_device.cuh
+4 −6 cub/util_ptx.cuh
+0 −15 cub/util_type.cuh
+5 −5 cub/warp/specializations/warp_reduce_shfl.cuh
+4 −5 cub/warp/specializations/warp_reduce_smem.cuh
+4 −4 cub/warp/specializations/warp_scan_shfl.cuh
+3 −4 cub/warp/specializations/warp_scan_smem.cuh
+5 −6 cub/warp/warp_exchange.cuh
+5 −6 cub/warp/warp_load.cuh
+7 −7 cub/warp/warp_merge_sort.cuh
+6 −6 cub/warp/warp_reduce.cuh
+5 −5 cub/warp/warp_scan.cuh
+5 −6 cub/warp/warp_store.cuh
+3 −23 examples/block/example_block_reduce_dyn_smem.cu
+24 −6 test/test_device_merge_sort.cu
+53 −55 test/test_device_segmented_sort.cu
+4 −8 test/test_iterator.cu
+126 −140 test/test_util.h
+1 −2 test/test_warp_mask.cu
+6 −6 test/test_warp_reduce.cu
1 change: 1 addition & 0 deletions dependencies/libcudacxx
Submodule libcudacxx added at 05d48a
25 changes: 14 additions & 11 deletions testing/allocator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
#include <thrust/detail/config.h>
#include <thrust/device_malloc_allocator.h>
#include <thrust/system/cpp/vector.h>

#include <nv/target>

#include <memory>

template <typename T>
Expand Down Expand Up @@ -60,9 +63,12 @@ DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct);
template <typename T>
struct my_allocator_with_custom_destroy
{
typedef T value_type;
typedef T & reference;
typedef const T & const_reference;
// This is only used with thrust::cpp::vector:
using system_type = thrust::cpp::tag;

using value_type = T;
using reference = T &;
using const_reference = const T &;

static bool g_state;

Expand All @@ -80,9 +86,7 @@ struct my_allocator_with_custom_destroy
__host__ __device__
void destroy(T *)
{
#if !__CUDA_ARCH__
g_state = true;
#endif
NV_IF_TARGET(NV_IS_HOST, (g_state = true;));
alliepiper marked this conversation as resolved.
Show resolved Hide resolved
}

value_type *allocate(std::ptrdiff_t n)
Expand Down Expand Up @@ -119,12 +123,14 @@ bool my_allocator_with_custom_destroy<T>::g_state = false;
template <typename T>
void TestAllocatorCustomDestroy(size_t n)
{
my_allocator_with_custom_destroy<T>::g_state = false;

{
thrust::cpp::vector<T, my_allocator_with_custom_destroy<T> > vec(n);
} // destroy everything

if (0 < n)
ASSERT_EQUAL(true, my_allocator_with_custom_destroy<T>::g_state);
// state should only be true when there are values to destroy:
ASSERT_EQUAL(n > 0, my_allocator_with_custom_destroy<T>::g_state);
}
DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy);

Expand Down Expand Up @@ -203,7 +209,6 @@ void TestAllocatorTraitsRebind()
}
DECLARE_UNITTEST(TestAllocatorTraitsRebind);

#if THRUST_CPP_DIALECT >= 2011
void TestAllocatorTraitsRebindCpp11()
{
ASSERT_EQUAL(
Expand Down Expand Up @@ -251,5 +256,3 @@ void TestAllocatorTraitsRebindCpp11()
);
}
DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
#endif // C++11

22 changes: 6 additions & 16 deletions testing/cuda/pair_sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
template<typename ExecutionPolicy, typename Iterator>
__global__
void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
void stable_sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort(exec, first, last);
#else
*is_supported = false;
#endif
}


Expand Down Expand Up @@ -43,19 +38,14 @@ void TestPairStableSortDevice(ExecutionPolicy exec)

thrust::device_vector<P> d_pairs = h_pairs;

thrust::device_vector<bool> is_supported(1);

stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin());
stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
// sort on the host
thrust::stable_sort(h_pairs.begin(), h_pairs.end());
// sort on the host
thrust::stable_sort(h_pairs.begin(), h_pairs.end());

ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
}
ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
};


Expand Down
24 changes: 7 additions & 17 deletions testing/cuda/pair_sort_by_key.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
__global__
void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
#else
*is_supported = false;
#endif
}


Expand Down Expand Up @@ -51,21 +46,16 @@ void TestPairStableSortByKeyDevice(ExecutionPolicy exec)
thrust::device_vector<P> d_pairs = h_pairs;
thrust::device_vector<int> d_values = h_values;

thrust::device_vector<bool> is_supported(1);

// sort on the device
stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin());
stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
// sort on the host
thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
// sort on the host
thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());

ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
ASSERT_EQUAL(h_values, d_values);
}
ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
ASSERT_EQUAL(h_values, d_values);
};


Expand Down
66 changes: 24 additions & 42 deletions testing/cuda/partition.cu
Original file line number Diff line number Diff line change
Expand Up @@ -286,16 +286,11 @@ void TestPartitionCopyStencilDeviceNoSync()
DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync);


template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2, typename Iterator3>
template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, pred);
#else
*is_supported = false;
#endif
}


Expand All @@ -313,24 +308,20 @@ void TestStablePartitionDevice(ExecutionPolicy exec)
data[4] = 2;

thrust::device_vector<iterator> result(1);
thrust::device_vector<bool> is_supported(1);

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin(), is_supported.begin());

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
thrust::device_vector<T> ref(5);
ref[0] = 2;
ref[1] = 2;
ref[2] = 1;
ref[3] = 1;
ref[4] = 1;
thrust::device_vector<T> ref(5);
ref[0] = 2;
ref[1] = 2;
ref[2] = 1;
ref[3] = 1;
ref[4] = 1;

ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}
ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}


Expand All @@ -355,16 +346,11 @@ void TestStablePartitionDeviceNoSync()
DECLARE_UNITTEST(TestStablePartitionDeviceNoSync);


template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3, typename Iterator4>
template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
__global__
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
*result = thrust::stable_partition(exec, first, last, stencil_first, pred);
#else
*is_supported = false;
#endif
}


Expand All @@ -389,24 +375,20 @@ void TestStablePartitionStencilDevice(ExecutionPolicy exec)
stencil[4] = 2;

thrust::device_vector<iterator> result(1);
thrust::device_vector<bool> is_supported(1);

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin(), is_supported.begin());

stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);

if(is_supported[0])
{
thrust::device_vector<T> ref(5);
ref[0] = 1;
ref[1] = 1;
ref[2] = 0;
ref[3] = 0;
ref[4] = 0;
thrust::device_vector<T> ref(5);
ref[0] = 1;
ref[1] = 1;
ref[2] = 0;
ref[3] = 0;
ref[4] = 0;

ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}
ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
ASSERT_EQUAL(ref, data);
}


Expand Down
24 changes: 7 additions & 17 deletions testing/cuda/sort.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,11 @@
#include <thrust/execution_policy.h>


template<typename ExecutionPolicy, typename Iterator, typename Compare, typename Iterator2>
template<typename ExecutionPolicy, typename Iterator, typename Compare>
__global__
void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp)
{
#if (__CUDA_ARCH__ >= 200)
*is_supported = true;
thrust::sort(exec, first, last, comp);
#else
*is_supported = false;
#endif
}


Expand All @@ -34,19 +29,14 @@ void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp
thrust::host_vector<T> h_data = unittest::random_integers<T>(n);
thrust::device_vector<T> d_data = h_data;

thrust::device_vector<bool> is_supported(1);

sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin());
sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp);
cudaError_t const err = cudaDeviceSynchronize();
ASSERT_EQUAL(cudaSuccess, err);


if(is_supported[0])
{
thrust::sort(h_data.begin(), h_data.end(), comp);

ASSERT_EQUAL(h_data, d_data);
}
thrust::sort(h_data.begin(), h_data.end(), comp);

ASSERT_EQUAL(h_data, d_data);
};


Expand Down Expand Up @@ -163,7 +153,7 @@ void TestComparisonSortCudaStreams()
cudaStreamSynchronize(s);

ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less<int>()));

cudaStreamDestroy(s);
}
DECLARE_UNITTEST(TestComparisonSortCudaStreams);
Expand Down
Loading