diff --git a/dependencies/cub b/dependencies/cub index 6552e4d42..1caaac184 160000 --- a/dependencies/cub +++ b/dependencies/cub @@ -1 +1 @@ -Subproject commit 6552e4d429c194e11962feb638abf87bcf220af0 +Subproject commit 1caaac18483cfae068ad8e4aed1780a27c6be0b9 diff --git a/thrust/detail/allocator/temporary_allocator.inl b/thrust/detail/allocator/temporary_allocator.inl index 69d1d100a..87d77d576 100644 --- a/thrust/detail/allocator/temporary_allocator.inl +++ b/thrust/detail/allocator/temporary_allocator.inl @@ -20,7 +20,7 @@ #include #include -#if defined(__CUDA_ARCH__) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +#if (defined(__PGI_CUDA__) || defined(__CUDA_ARCH__)) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA #include #endif @@ -45,11 +45,15 @@ __host__ __device__ // note that we pass cnt to deallocate, not a value derived from result.second deallocate(result.first, cnt); -#if !defined(__CUDA_ARCH__) - throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed"); -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed"); -#endif + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed"); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed"); + #endif + } } // end if return result.first; diff --git a/thrust/detail/config/cpp_compatibility.h b/thrust/detail/config/cpp_compatibility.h index 5d48d6152..b304eb2c0 100644 --- a/thrust/detail/config/cpp_compatibility.h +++ b/thrust/detail/config/cpp_compatibility.h @@ -34,6 +34,10 @@ # define THRUST_DEFAULT = default; # define THRUST_NOEXCEPT noexcept # define THRUST_FINAL final +// THRUST_STATIC_CONSTANT is a holdover from an earlier Thrust version, and is +// here only because we are using a hybrid of Thrust versions. Don't push this +// back to Thrust. +# define THRUST_STATIC_CONSTANT static constexpr #else # define THRUST_CONSTEXPR # define THRUST_OVERRIDE @@ -49,13 +53,14 @@ // FIXME: Combine THRUST_INLINE_CONSTANT and // THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly // supports `constexpr` globals in host and device code. -#ifdef __CUDA_ARCH__ +// NVC++ uses the same definitions as NVCC does for device code. +#if defined(__CUDA_ARCH__) || defined(__PGI_CUDA__) // FIXME: Add this when NVCC supports inline variables. //# if THRUST_CPP_DIALECT >= 2017 //# define THRUST_INLINE_CONSTANT inline constexpr //# define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr # if THRUST_CPP_DIALECT >= 2011 -# define THRUST_INLINE_CONSTANT static constexpr +# define THRUST_INLINE_CONSTANT static const __device__ # define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr # else # define THRUST_INLINE_CONSTANT static const __device__ @@ -75,3 +80,20 @@ # endif #endif +#if defined(__PGI_CUDA__) +# define THRUST_IS_DEVICE_CODE __builtin_is_device_code() +# define THRUST_IS_HOST_CODE (!__builtin_is_device_code()) +# define THRUST_INCLUDE_DEVICE_CODE 1 +# define THRUST_INCLUDE_HOST_CODE 1 +#elif defined(__CUDA_ARCH__) +# define THRUST_IS_DEVICE_CODE 1 +# define THRUST_IS_HOST_CODE 0 +# define THRUST_INCLUDE_DEVICE_CODE 1 +# define THRUST_INCLUDE_HOST_CODE 0 +#else +# define THRUST_IS_DEVICE_CODE 0 +# define THRUST_IS_HOST_CODE 1 +# define THRUST_INCLUDE_DEVICE_CODE 0 +# define THRUST_INCLUDE_HOST_CODE 1 +#endif + diff --git a/thrust/detail/config/exec_check_disable.h b/thrust/detail/config/exec_check_disable.h index dcadaf141..50e8269a0 100644 --- a/thrust/detail/config/exec_check_disable.h +++ b/thrust/detail/config/exec_check_disable.h @@ -22,7 +22,11 @@ #include -#if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) +/* pragma nv_exec_check_disable only works with NVCC, not with PGI or Clang. + Having a macro expand to a #pragma (rather than _Pragma) only works with + NVCC's compilation model, not with other compilers. */ +#if defined(__CUDACC__) && !defined(__PGI_CUDA__) && \ + !(defined(__CUDA__) && defined(__clang__)) #define __thrust_exec_check_disable__ #pragma nv_exec_check_disable diff --git a/thrust/detail/contiguous_storage.inl b/thrust/detail/contiguous_storage.inl index 2556260f2..efce89130 100644 --- a/thrust/detail/contiguous_storage.inl +++ b/thrust/detail/contiguous_storage.inl @@ -428,15 +428,19 @@ __host__ __device__ void contiguous_storage ::swap_allocators(false_type, Alloc &other) { -#ifdef __CUDA_ARCH__ - // allocators must be equal when swapping containers with allocators that propagate on swap - assert(!is_allocator_not_equal(other)); -#else - if (is_allocator_not_equal(other)) - { - throw allocator_mismatch_on_swap(); + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + // allocators must be equal when swapping containers with allocators that propagate on swap + assert(!is_allocator_not_equal(other)); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + if (is_allocator_not_equal(other)) + { + throw allocator_mismatch_on_swap(); + } + #endif } -#endif thrust::swap(m_allocator, other); } // end contiguous_storage::swap_allocators() diff --git a/thrust/detail/functional/actor.h b/thrust/detail/functional/actor.h index 666de09ee..af85b972d 100644 --- a/thrust/detail/functional/actor.h +++ b/thrust/detail/functional/actor.h @@ -52,7 +52,7 @@ template typedef Eval eval_type; __host__ __device__ - actor(void); + THRUST_CONSTEXPR actor(void); __host__ __device__ actor(const Eval &base); diff --git a/thrust/detail/functional/actor.inl b/thrust/detail/functional/actor.inl index e09dd4800..34c7235b5 100644 --- a/thrust/detail/functional/actor.inl +++ b/thrust/detail/functional/actor.inl @@ -38,7 +38,7 @@ namespace functional template __host__ __device__ - actor + THRUST_CONSTEXPR actor ::actor(void) : eval_type() {} diff --git a/thrust/detail/functional/argument.h b/thrust/detail/functional/argument.h index 88b48a6d2..188704938 100644 --- a/thrust/detail/functional/argument.h +++ b/thrust/detail/functional/argument.h @@ -59,7 +59,7 @@ template }; __host__ __device__ - argument(void){} + THRUST_CONSTEXPR argument(void){} template __host__ __device__ diff --git a/thrust/detail/integer_math.h b/thrust/detail/integer_math.h index d64577c68..f2495c0b2 100644 --- a/thrust/detail/integer_math.h +++ b/thrust/detail/integer_math.h @@ -32,22 +32,27 @@ template __host__ __device__ __thrust_forceinline__ Integer clz(Integer x) { -#if __CUDA_ARCH__ - return ::__clz(x); -#else - int num_bits = 8 * sizeof(Integer); - int num_bits_minus_one = num_bits - 1; - - for (int i = num_bits_minus_one; i >= 0; --i) - { - if ((Integer(1) << i) & x) - { - return num_bits_minus_one - i; - } + Integer result; + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + result = ::__clz(x); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + int num_bits = 8 * sizeof(Integer); + int num_bits_minus_one = num_bits - 1; + result = num_bits; + for (int i = num_bits_minus_one; i >= 0; --i) + { + if ((Integer(1) << i) & x) + { + result = num_bits_minus_one - i; + break; + } + } + #endif } - - return num_bits; -#endif + return result; } template diff --git a/thrust/detail/seq.h b/thrust/detail/seq.h index ecc1d8dd5..b548652d2 100644 --- a/thrust/detail/seq.h +++ b/thrust/detail/seq.h @@ -31,7 +31,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy, thrust::system::detail::sequential::execution_policy> { __host__ __device__ - seq_t() : thrust::system::detail::sequential::execution_policy() {} + THRUST_CONSTEXPR seq_t() : thrust::system::detail::sequential::execution_policy() {} // allow any execution_policy to convert to seq_t template @@ -45,11 +45,7 @@ struct seq_t : thrust::system::detail::sequential::execution_policy, } // end detail -#ifdef __CUDA_ARCH__ -static const __device__ detail::seq_t seq; -#else -static const detail::seq_t seq; -#endif +THRUST_INLINE_CONSTANT detail::seq_t seq; } // end thrust diff --git a/thrust/execution_policy.h b/thrust/execution_policy.h index ef1a5d853..4c1380d6e 100644 --- a/thrust/execution_policy.h +++ b/thrust/execution_policy.h @@ -27,6 +27,8 @@ //! \cond +//! \cond + // #include the host system's execution_policy header #define __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_HOST_SYSTEM_ROOT/execution_policy.h> #include __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER @@ -39,6 +41,8 @@ //! \endcond +//! \endcond + namespace thrust { @@ -344,11 +348,7 @@ static const detail::host_t host; * \see host_execution_policy * \see thrust::device */ -#ifdef __CUDA_ARCH__ -static const __device__ detail::device_t device; -#else -static const detail::device_t device; -#endif +THRUST_INLINE_CONSTANT detail::device_t device; // define seq for the purpose of Doxygenating it diff --git a/thrust/functional.h b/thrust/functional.h index ec8c62104..a550afddb 100644 --- a/thrust/functional.h +++ b/thrust/functional.h @@ -1448,92 +1448,52 @@ namespace placeholders /*! \p thrust::placeholders::_1 is the placeholder for the first function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<0>::type _1; -#else -static const thrust::detail::functional::placeholder<0>::type _1; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<0>::type _1; /*! \p thrust::placeholders::_2 is the placeholder for the second function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<1>::type _2; -#else -static const thrust::detail::functional::placeholder<1>::type _2; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<1>::type _2; /*! \p thrust::placeholders::_3 is the placeholder for the third function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<2>::type _3; -#else -static const thrust::detail::functional::placeholder<2>::type _3; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<2>::type _3; /*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<3>::type _4; -#else -static const thrust::detail::functional::placeholder<3>::type _4; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<3>::type _4; /*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<4>::type _5; -#else -static const thrust::detail::functional::placeholder<4>::type _5; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<4>::type _5; /*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<5>::type _6; -#else -static const thrust::detail::functional::placeholder<5>::type _6; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<5>::type _6; /*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<6>::type _7; -#else -static const thrust::detail::functional::placeholder<6>::type _7; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<6>::type _7; /*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<7>::type _8; -#else -static const thrust::detail::functional::placeholder<7>::type _8; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<7>::type _8; /*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<8>::type _9; -#else -static const thrust::detail::functional::placeholder<8>::type _9; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<8>::type _9; /*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter. */ -#ifdef __CUDA_ARCH__ -static const __device__ thrust::detail::functional::placeholder<9>::type _10; -#else -static const thrust::detail::functional::placeholder<9>::type _10; -#endif +THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<9>::type _10; } // end placeholders diff --git a/thrust/system/cpp/detail/par.h b/thrust/system/cpp/detail/par.h index d721799d7..740c39e8b 100644 --- a/thrust/system/cpp/detail/par.h +++ b/thrust/system/cpp/detail/par.h @@ -35,14 +35,14 @@ struct par_t : thrust::system::cpp::detail::execution_policy, thrust::system::cpp::detail::execution_policy> { __host__ __device__ - par_t() : thrust::system::cpp::detail::execution_policy() {} + THRUST_CONSTEXPR par_t() : thrust::system::cpp::detail::execution_policy() {} }; } // end detail -static const detail::par_t par; +THRUST_INLINE_CONSTANT detail::par_t par; } // end cpp diff --git a/thrust/system/cuda/detail/assign_value.h b/thrust/system/cuda/detail/assign_value.h index 601700cb5..3b0a25ff8 100644 --- a/thrust/system/cuda/detail/assign_value.h +++ b/thrust/system/cuda/detail/assign_value.h @@ -46,11 +46,15 @@ inline __host__ __device__ } }; -#ifndef __CUDA_ARCH__ - war_nvbugs_881631::host_path(exec,dst,src); -#else - war_nvbugs_881631::device_path(exec,dst,src); -#endif // __CUDA_ARCH__ + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + war_nvbugs_881631::host_path(exec,dst,src); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + war_nvbugs_881631::device_path(exec,dst,src); + #endif + } } // end assign_value() @@ -78,11 +82,15 @@ inline __host__ __device__ } }; -#if __CUDA_ARCH__ - war_nvbugs_881631::device_path(systems,dst,src); -#else - war_nvbugs_881631::host_path(systems,dst,src); -#endif + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + war_nvbugs_881631::device_path(systems,dst,src); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + war_nvbugs_881631::host_path(systems,dst,src); + #endif + } } // end assign_value() diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h index a54974e6d..22308d5f1 100644 --- a/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/system/cuda/detail/core/agent_launcher.h @@ -47,7 +47,7 @@ namespace cuda_cub { namespace core { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) || defined(__PGI_CUDA__) #if 0 template void __global__ @@ -518,11 +518,15 @@ namespace core { { if (debug_sync) { -#ifdef __CUDA_ARCH__ - cudaDeviceSynchronize(); -#else - cudaStreamSynchronize(stream); -#endif + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + cudaDeviceSynchronize(); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + cudaStreamSynchronize(stream); + #endif + } } } diff --git a/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/system/cuda/detail/core/triple_chevron_launch.h index 5eabad455..4be160a35 100644 --- a/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -834,10 +834,14 @@ namespace launcher { } -#ifdef __CUDA_ARCH__ -#define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device +#if defined(__PGI_CUDA__) +# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE(...) \ + (__builtin_is_device_code() ? \ + doit_device(__VA_ARGS__) : doit_host(__VA_ARGS__)) +#elif defined(__CUDA_ARCH__) +# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_device #else -#define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host +# define THRUST_TRIPLE_LAUNCHER_HOSTDEVICE doit_host #endif #if 0 diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h index a917244ef..cf7d6c4e5 100644 --- a/thrust/system/cuda/detail/core/util.h +++ b/thrust/system/cuda/detail/core/util.h @@ -41,16 +41,28 @@ THRUST_BEGIN_NS namespace cuda_cub { namespace core { -#if (__CUDA_ARCH__ >= 600) -# define THRUST_TUNING_ARCH sm60 -#elif (__CUDA_ARCH__ >= 520) -# define THRUST_TUNING_ARCH sm52 -#elif (__CUDA_ARCH__ >= 350) -# define THRUST_TUNING_ARCH sm35 -#elif (__CUDA_ARCH__ >= 300) -# define THRUST_TUNING_ARCH sm30 -#elif !defined (__CUDA_ARCH__) -# define THRUST_TUNING_ARCH sm30 +#ifdef __PGI_CUDA__ +# if (__PGI_CUDA_ARCH__ >= 600) +# define THRUST_TUNING_ARCH sm60 +# elif (__PGI_CUDA_ARCH__ >= 520) +# define THRUST_TUNING_ARCH sm52 +# elif (__PGI_CUDA_ARCH__ >= 350) +# define THRUST_TUNING_ARCH sm35 +# else +# define THRUST_TUNING_ARCH sm30 +# endif +#else +# if (__CUDA_ARCH__ >= 600) +# define THRUST_TUNING_ARCH sm60 +# elif (__CUDA_ARCH__ >= 520) +# define THRUST_TUNING_ARCH sm52 +# elif (__CUDA_ARCH__ >= 350) +# define THRUST_TUNING_ARCH sm35 +# elif (__CUDA_ARCH__ >= 300) +# define THRUST_TUNING_ARCH sm30 +# elif !defined (__CUDA_ARCH__) +# define THRUST_TUNING_ARCH sm30 +# endif #endif // Typelist - a container of types, supports up to 10 types @@ -341,14 +353,30 @@ namespace core { typename get_plan::type THRUST_RUNTIME_FUNCTION get_agent_plan(int ptx_version) { -#if (CUB_PTX_ARCH > 0) && defined(__THRUST_HAS_CUDART__) - typedef typename get_plan::type Plan; - THRUST_UNUSED_VAR(ptx_version); - // We're on device, use default policy - return Plan(typename Agent::ptx_plan()); -#else - return get_agent_plan_impl::get(ptx_version); -#endif + // Use one path, with Agent::ptx_plan, for device code where device-side + // kernel launches are supported. The other path, with + // get_agent_plan_impl::get(version), is for host code and for device + // code without device-side kernel launches. PGI and NVCC check for + // these situations differently. + #ifdef __PGI_CUDA__ + #ifdef __THRUST_HAS_CUDART__ + if (CUB_IS_DEVICE_CODE) { + return typename get_plan::type(typename Agent::ptx_plan()); + } else + #endif + { + return get_agent_plan_impl::get(ptx_version); + } + #else + #if (CUB_PTX_ARCH > 0) && defined(__THRUST_HAS_CUDART__) + typedef typename get_plan::type Plan; + THRUST_UNUSED_VAR(ptx_version); + // We're on device, use default policy + return Plan(typename Agent::ptx_plan()); + #else + return get_agent_plan_impl::get(ptx_version); + #endif + #endif } // XXX keep this dead-code for now as a gentle reminder diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h index 746565f34..faef53999 100644 --- a/thrust/system/cuda/detail/extrema.h +++ b/thrust/system/cuda/detail/extrema.h @@ -127,8 +127,8 @@ namespace __extrema { pair_type const &lhs_min = get<0>(lhs); pair_type const &rhs_max = get<1>(rhs); pair_type const &lhs_max = get<1>(lhs); - return make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), - arg_max_t(predicate)(lhs_max, rhs_max)); + return thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min), + arg_max_t(predicate)(lhs_max, rhs_max)); } struct duplicate_tuple @@ -385,7 +385,7 @@ namespace __extrema { typedef tuple > iterator_tuple; typedef zip_iterator zip_iterator; - iterator_tuple iter_tuple = make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); typedef ArgFunctor arg_min_t; @@ -518,7 +518,7 @@ minmax_element(execution_policy &policy, typedef tuple > iterator_tuple; typedef zip_iterator zip_iterator; - iterator_tuple iter_tuple = make_tuple(first, counting_iterator_t(0)); + iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t(0)); typedef __extrema::arg_minmax_f arg_minmax_t; diff --git a/thrust/system/cuda/detail/get_value.h b/thrust/system/cuda/detail/get_value.h index 68b987dde..019082dcd 100644 --- a/thrust/system/cuda/detail/get_value.h +++ b/thrust/system/cuda/detail/get_value.h @@ -61,11 +61,17 @@ inline __host__ __device__ } }; -#ifndef __CUDA_ARCH__ - return war_nvbugs_881631::host_path(exec, ptr); -#else - return war_nvbugs_881631::device_path(exec, ptr); -#endif // __CUDA_ARCH__ + result_type result; + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + result = war_nvbugs_881631::host_path(exec, ptr); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + result = war_nvbugs_881631::device_path(exec, ptr); + #endif + } + return result; } // end get_value_msvc2005_war() diff --git a/thrust/system/cuda/detail/iter_swap.h b/thrust/system/cuda/detail/iter_swap.h index ec545b056..ac224c042 100644 --- a/thrust/system/cuda/detail/iter_swap.h +++ b/thrust/system/cuda/detail/iter_swap.h @@ -48,11 +48,15 @@ void iter_swap(thrust::cuda::execution_policy &, Pointer1 a, Poin } }; -#ifndef __CUDA_ARCH__ - return war_nvbugs_881631::host_path(a, b); -#else - return war_nvbugs_881631::device_path(a, b); -#endif // __CUDA_ARCH__ + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + war_nvbugs_881631::host_path(a, b); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + war_nvbugs_881631::device_path(a, b); + #endif + } } // end iter_swap() diff --git a/thrust/system/cuda/detail/malloc_and_free.h b/thrust/system/cuda/detail/malloc_and_free.h index e954479c7..ed6cb87b2 100644 --- a/thrust/system/cuda/detail/malloc_and_free.h +++ b/thrust/system/cuda/detail/malloc_and_free.h @@ -52,22 +52,26 @@ void *malloc(execution_policy &, std::size_t n) { void *result = 0; -#ifndef __CUDA_ARCH__ -#ifdef __CUB_CACHING_MALLOC - cub::CachingDeviceAllocator &alloc = get_allocator(); - cudaError_t status = alloc.DeviceAllocate(&result, n); -#else - cudaError_t status = cudaMalloc(&result, n); -#endif + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + #ifdef __CUB_CACHING_MALLOC + cub::CachingDeviceAllocator &alloc = get_allocator(); + cudaError_t status = alloc.DeviceAllocate(&result, n); + #else + cudaError_t status = cudaMalloc(&result, n); + #endif - if(status != cudaSuccess) - { - cudaGetLastError(); // Clear global CUDA error state. - throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str()); + if(status != cudaSuccess) + { + cudaGetLastError(); // Clear global CUDA error state. + throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str()); + } + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n)); + #endif } -#else - result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n)); -#endif return result; } // end malloc() @@ -77,17 +81,21 @@ template __host__ __device__ void free(execution_policy &, Pointer ptr) { -#ifndef __CUDA_ARCH__ -#ifdef __CUB_CACHING_MALLOC - cub::CachingDeviceAllocator &alloc = get_allocator(); - cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr)); -#else - cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr)); -#endif - cuda_cub::throw_on_error(status, "device free failed"); -#else - thrust::free(thrust::seq, ptr); -#endif + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + #ifdef __CUB_CACHING_MALLOC + cub::CachingDeviceAllocator &alloc = get_allocator(); + cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr)); + #else + cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr)); + #endif + cuda_cub::throw_on_error(status, "device free failed"); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + thrust::free(thrust::seq, ptr); + #endif + } } // end free() } // namespace cuda_cub diff --git a/thrust/system/cuda/detail/mismatch.h b/thrust/system/cuda/detail/mismatch.h index 845c93723..5854be3ac 100644 --- a/thrust/system/cuda/detail/mismatch.h +++ b/thrust/system/cuda/detail/mismatch.h @@ -87,8 +87,8 @@ mismatch(execution_policy& policy, transform_first + thrust::distance(first1, last1), identity()); - return make_pair(first1 + thrust::distance(transform_first,result), - first2 + thrust::distance(transform_first,result)); + return thrust::make_pair(first1 + thrust::distance(transform_first,result), + first2 + thrust::distance(transform_first,result)); } template cudaError_t synchronize_stream(execute_on_stream_base &exec) { - #if !__CUDA_ARCH__ - cudaStreamSynchronize(exec.stream); - return cudaGetLastError(); - #elif __THRUST_HAS_CUDART__ - THRUST_UNUSED_VAR(exec); - cudaDeviceSynchronize(); - return cudaGetLastError(); - #else - THRUST_UNUSED_VAR(exec); - return cudaSuccess; - #endif + cudaError_t result; + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + cudaStreamSynchronize(exec.stream); + result = cudaGetLastError(); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + #if __THRUST_HAS_CUDART__ + THRUST_UNUSED_VAR(exec); + cudaDeviceSynchronize(); + result = cudaGetLastError(); + #else + THRUST_UNUSED_VAR(exec); + result = cudaSuccess; + #endif + #endif + } + return result; } }; @@ -109,7 +117,7 @@ struct par_t : execution_policy, typedef execution_policy base_t; __host__ __device__ - par_t() : base_t() {} + THRUST_CONSTEXPR par_t() : base_t() {} typedef execute_on_stream stream_attachment_type; @@ -121,11 +129,7 @@ struct par_t : execution_policy, } }; -#ifdef __CUDA_ARCH__ -static const __device__ par_t par; -#else -static const par_t par; -#endif +THRUST_INLINE_CONSTANT par_t par; } // namespace cuda_ namespace system { diff --git a/thrust/system/cuda/detail/reverse.h b/thrust/system/cuda/detail/reverse.h index 4ce432683..4c2ea42ac 100644 --- a/thrust/system/cuda/detail/reverse.h +++ b/thrust/system/cuda/detail/reverse.h @@ -85,7 +85,7 @@ reverse(execution_policy &policy, // find the midpoint of [first,last) difference_type N = thrust::distance(first, last); ItemsIt mid(first); - advance(mid, N / 2); + thrust::advance(mid, N / 2); cuda_cub::swap_ranges(policy, first, mid, make_reverse_iterator(last)); } diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h index 64aa03420..3a267c541 100644 --- a/thrust/system/cuda/detail/util.h +++ b/thrust/system/cuda/detail/util.h @@ -148,11 +148,15 @@ trivial_copy_device_to_device(Policy & policy, inline void __host__ __device__ terminate() { -#ifdef __CUDA_ARCH__ - asm("trap;"); -#else - std::terminate(); -#endif + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + asm("trap;"); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + std::terminate(); + #endif + } } __host__ __device__ @@ -166,19 +170,23 @@ inline void throw_on_error(cudaError_t status) if (cudaSuccess != status) { -#if !defined(__CUDA_ARCH__) - throw thrust::system_error(status, thrust::cuda_category()); -#else -#if __THRUST_HAS_CUDART__ - printf("Thrust CUDA backend error: %s: %s\n", - cudaGetErrorName(status), - cudaGetErrorString(status)); -#else - printf("Thrust CUDA backend error: %d\n", - static_cast(status)); -#endif - cuda_cub::terminate(); -#endif + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + throw thrust::system_error(status, thrust::cuda_category()); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + #if __THRUST_HAS_CUDART__ + printf("Thrust CUDA backend error: %s: %s\n", + cudaGetErrorName(status), + cudaGetErrorString(status)); + #else + printf("Thrust CUDA backend error: %d\n", + static_cast(status)); + #endif + cuda_cub::terminate(); + #endif + } } } @@ -193,21 +201,25 @@ inline void throw_on_error(cudaError_t status, char const *msg) if (cudaSuccess != status) { -#if !defined(__CUDA_ARCH__) - throw thrust::system_error(status, thrust::cuda_category(), msg); -#else -#if __THRUST_HAS_CUDART__ - printf("Thrust CUDA backend error: %s: %s: %s\n", - cudaGetErrorName(status), - cudaGetErrorString(status), - msg); -#else - printf("Thrust CUDA backend error: %d: %s \n", - static_cast(status), - msg); -#endif - cuda_cub::terminate(); -#endif + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + throw thrust::system_error(status, thrust::cuda_category(), msg); + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + #if __THRUST_HAS_CUDART__ + printf("Thrust CUDA backend error: %s: %s: %s\n", + cudaGetErrorName(status), + cudaGetErrorString(status), + msg); + #else + printf("Thrust CUDA backend error: %d: %s \n", + static_cast(status), + msg); + #endif + cuda_cub::terminate(); + #endif + } } } @@ -232,6 +244,16 @@ struct transform_input_iterator_t transform_input_iterator_t(InputIt input, UnaryOp op) : input(input), op(op) {} +#ifdef __PGI_CUDA__ + // UnaryOp might not be copy assignable, such as when it is a lambda. Define + // an explicit copy assignment operator that doesn't try to assign it. + self_t& operator=(const self_t& o) + { + input = o.input; + return *this; + } +#endif + /// Postfix increment __host__ __device__ __forceinline__ self_t operator++(int) { @@ -350,6 +372,17 @@ struct transform_pair_of_input_iterators_t BinaryOp op_) : input1(input1_), input2(input2_), op(op_) {} +#ifdef __PGI_CUDA__ + // BinaryOp might not be copy assignable, such as when it is a lambda. + // Define an explicit copy assignment operator that doesn't try to assign it. + self_t& operator=(const self_t& o) + { + input1 = o.input1; + input2 = o.input2; + return *this; + } +#endif + /// Postfix increment __host__ __device__ __forceinline__ self_t operator++(int) { diff --git a/thrust/system/detail/sequential/execution_policy.h b/thrust/system/detail/sequential/execution_policy.h index 7b5f69666..81d52f140 100644 --- a/thrust/system/detail/sequential/execution_policy.h +++ b/thrust/system/detail/sequential/execution_policy.h @@ -50,7 +50,7 @@ template<> // tag's definition comes before the generic definition of execution_policy struct tag : execution_policy { - __host__ __device__ tag() {} + __host__ __device__ THRUST_CONSTEXPR tag() {} }; // allow conversion to tag when it is not a successor @@ -66,11 +66,7 @@ template }; -#ifdef __CUDA_ARCH__ -static const __device__ tag seq; -#else -static const tag seq; -#endif +THRUST_INLINE_CONSTANT tag seq; } // end sequential diff --git a/thrust/system/detail/sequential/malloc_and_free.h b/thrust/system/detail/sequential/malloc_and_free.h index a54ddf0a9..7c545250e 100644 --- a/thrust/system/detail/sequential/malloc_and_free.h +++ b/thrust/system/detail/sequential/malloc_and_free.h @@ -35,11 +35,7 @@ template inline __host__ __device__ void *malloc(execution_policy &, std::size_t n) { -#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200) return std::malloc(n); -#else - return 0; -#endif } // end mallc() @@ -47,9 +43,7 @@ template inline __host__ __device__ void free(sequential::execution_policy &, Pointer ptr) { -#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200) std::free(thrust::raw_pointer_cast(ptr)); -#endif } // end mallc() diff --git a/thrust/system/detail/sequential/stable_merge_sort.inl b/thrust/system/detail/sequential/stable_merge_sort.inl index 8ba3bf908..2939e0668 100644 --- a/thrust/system/detail/sequential/stable_merge_sort.inl +++ b/thrust/system/detail/sequential/stable_merge_sort.inl @@ -353,12 +353,16 @@ void stable_merge_sort(sequential::execution_policy &exec, RandomAccessIterator last, StrictWeakOrdering comp) { - // avoid recursion in CUDA threads -#ifdef __CUDA_ARCH__ - stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp); -#else - stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp); -#endif + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + // avoid recursion in CUDA threads + stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp); + #endif + } } @@ -373,12 +377,16 @@ void stable_merge_sort_by_key(sequential::execution_policy &exec, RandomAccessIterator2 first2, StrictWeakOrdering comp) { - // avoid recursion in CUDA threads -#ifdef __CUDA_ARCH__ - stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp); -#else - stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp); -#endif + if (THRUST_IS_DEVICE_CODE) { + #if THRUST_INCLUDE_DEVICE_CODE + // avoid recursion in CUDA threads + stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp); + #endif + } else { + #if THRUST_INCLUDE_HOST_CODE + stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp); + #endif + } } diff --git a/thrust/system/detail/sequential/trivial_copy.h b/thrust/system/detail/sequential/trivial_copy.h index 77bf6dd42..6cc3d3a3b 100644 --- a/thrust/system/detail/sequential/trivial_copy.h +++ b/thrust/system/detail/sequential/trivial_copy.h @@ -40,12 +40,18 @@ __host__ __device__ std::ptrdiff_t n, T *result) { -#ifndef __CUDA_ARCH__ - std::memmove(result, first, n * sizeof(T)); - return result + n; -#else - return thrust::system::detail::sequential::general_copy_n(first, n, result); -#endif + T* return_value; + if (THRUST_IS_HOST_CODE) { + #if THRUST_INCLUDE_HOST_CODE + std::memmove(result, first, n * sizeof(T)); + return_value = result + n; + #endif + } else { + #if THRUST_INCLUDE_DEVICE_CODE + return_value = thrust::system::detail::sequential::general_copy_n(first, n, result); + #endif + } + return return_value; } // end trivial_copy_n() diff --git a/thrust/version.h b/thrust/version.h index 042592001..ef7d1d9d2 100644 --- a/thrust/version.h +++ b/thrust/version.h @@ -71,7 +71,7 @@ * \brief The preprocessor macro \p THRUST_PATCH_NUMBER encodes the * patch number of the Thrust library. */ -#define THRUST_PATCH_NUMBER 0 +#define THRUST_PATCH_NUMBER 1 // Declare these namespaces here for the purpose of Doxygenating them