Skip to content

Commit

Permalink
Regards #304: Re-establish CUDA 9.2 compatibility, mostly:
Browse files Browse the repository at this point in the history
* Corrected wrong version check in `nvtx.hpp` (needed 10000, was using 1000)
* Dropped the `unregistered_` memory type from the `memory::type_t` enum (actually, I've forgotten why we had it there in the first place)
* Defining some `kernel_t` and `context_t` methods conditionally, since they're not supported in CUDA 9.2; and I'd rather they not fail at runtime.
* In the "p2p bandwidth latency test" example program, made one mode of operation unavailable when the CUDA version is under 10.0, since that mode uses a CUDA 10 API call and was not implemented in the CUDA 9.x version of the example program
* In CUDA 9.2 NVRTC, you can't get the address of a `__constant__`, only of a kernel. So, we disable the tests involving `__constant__` symbols.

Caveats:

* Some tests still fail; it remains to determine why.
* These changes target 9.2 compatibility, not 9.0.
  • Loading branch information
eyalroz committed Jun 20, 2022
1 parent 9658fe5 commit 6e3e5a7
Show file tree
Hide file tree
Showing 10 changed files with 43 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -137,21 +137,25 @@ void enqueue_p2p_copy(
P2PEngine p2p_mechanism,
cuda::stream_t& stream)
{
auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);


#if CUDA_VERSION <= 10000
(void) p2paccess;
(void) p2p_mechanism;
#else
if (p2p_mechanism == SM && p2paccess)
{
auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);

for (int r = 0; r < repeat; r++) {
stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4));
}
}
else
{
#endif // CUDA_VERSION >= 10000
{
for (int r = 0; r < repeat; r++) {
// Since we assume Compute Capability >= 2.0, all devices support the
// Unified Virtual Address Space, so we don't need to use
Expand Down Expand Up @@ -520,7 +524,12 @@ command_line_options handle_command_line(int argc, char** argv)
test_p2p_read = P2P_READ;
}
if (checkCmdLineFlag(argc, (const char**) (argv), "sm_copy")) {
#if CUDA_VERSION <= 10000
std::cerr << "This mechanism is unsupported by this program before CUDA 10.0" << std::endl;
exit(EXIT_FAILURE);
#else
p2p_mechanism = SM;
#endif
}
return { test_p2p_read, p2p_mechanism };
}
Expand Down
6 changes: 6 additions & 0 deletions examples/other/jitify/jitify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,10 @@ int main(int, char**)

bool test_simple_result = test_simple<float>();
bool test_kernels_result = test_kernels<float>();
#if CUDA_VERSION >= 10000
bool test_constant_result = test_constant();
bool test_constant_2_result = test_constant_2();
#endif // CUDA_VERSION >= 10000

// Note: There's no source-based or signature-based kernel caching mechanism - but
// you can certainly keep the modules and kernels built within the test_XXXX functions
Expand All @@ -340,14 +342,18 @@ int main(int, char**)

cout << "test_simple<float>: " << pass_or_fail(test_simple_result) << endl;
cout << "test_kernels<float>: " << pass_or_fail(test_kernels_result) << endl;
#if CUDA_VERSION >= 10000
cout << "test_constant: " << pass_or_fail(test_constant_result) << endl;
cout << "test_constant_2: " << pass_or_fail(test_constant_2_result) << endl;
#endif // CUDA_VERSION >= 10000

return not(
test_simple_result
and test_kernels_result
#if CUDA_VERSION >= 10000
and test_constant_result
and test_constant_2_result
#endif // CUDA_VERSION >= 10000
);
}

3 changes: 2 additions & 1 deletion src/cuda/api/apriori_compiled_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ class apriori_compiled_kernel_t final : public kernel_t {

void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const override;

#if CUDA_VERSION >= 10000
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
grid::block_dimension_t block_size_limit = 0,
Expand All @@ -359,7 +360,7 @@ class apriori_compiled_kernel_t final : public kernel_t {
shared_memory_size_determiner,
block_size_limit, disable_caching_override);
}

#endif

kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const override;

Expand Down
2 changes: 2 additions & 0 deletions src/cuda/api/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ class context_t {
return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT);
}

#if CUDA_VERSION >= 10000
/**
* @return maximum granularity of fetching from the L2 cache
*
Expand All @@ -423,6 +424,7 @@ class context_t {
scoped_setter_type set_context_for_this_scope(handle_);
return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY);
}
#endif

/**
* @brief Returns the shared memory bank size, as described in
Expand Down
6 changes: 5 additions & 1 deletion src/cuda/api/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ enum named_t : ::std::underlying_type<status_t>::type {
insufficient_driver = cudaErrorInsufficientDriver,
set_on_active_process = cudaErrorSetOnActiveProcess,
invalid_surface = cudaErrorInvalidSurface,
no_device = cudaErrorNoDevice, // == 100
#if CUDA_VERSION < 10000
no_device = CUDA_ERROR_NO_DEVICE,
#else
no_device = cudaErrorNoDevice, // == 100
#endif // CUDA_VERSION < 10000
ecc_uncorrectable = cudaErrorECCUncorrectable,
shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound,
shared_object_init_failed = cudaErrorSharedObjectInitFailed,
Expand Down
10 changes: 6 additions & 4 deletions src/cuda/api/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ class kernel_t {
return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}

#if CUDA_VERSION >= 10000
/**
* @brief obtain the dimensions of a minimum grid which is expected to
* achieve maximum occupancy on the GPU the kernel is associated with.
Expand Down Expand Up @@ -206,6 +207,7 @@ class kernel_t {
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false) const;
///@}
#endif // CUDA_VERSION >= 10000

/**
* @brief Calculates the number of grid blocks which may be "active" on a given GPU
Expand Down Expand Up @@ -351,6 +353,7 @@ inline grid::dimension_t max_active_blocks_per_multiprocessor(
return result;
}

#if CUDA_VERSION >= 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
// if block_size_limit is 0, it is ignored.
inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
Expand All @@ -361,9 +364,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
cuda::grid::block_dimension_t block_size_limit,
bool disable_caching_override)
{
#if CUDA_VERSION <= 10000
throw cuda::runtime_error {cuda::status::not_yet_implemented};
#else
int min_grid_size_in_blocks { 0 };
int block_size { 0 };
// Note: only initializing the values here because of a
Expand All @@ -382,8 +382,8 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
"Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
+ " with maximum occupancy given dynamic shared memory and block size data");
return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
#endif // CUDA_VERSION <= 10000
}
#endif // CUDA_VERSION >= 10000

} // namespace detail_

Expand Down Expand Up @@ -436,6 +436,7 @@ inline ::std::string identify(const kernel_t& kernel)
} // namespace detail_
} // namespace kernel

#if CUDA_VERSION >= 10000
inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size,
grid::block_dimension_t block_size_limit,
Expand All @@ -457,6 +458,7 @@ inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
handle(), device_id(), shared_memory_size_determiner,
no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000

inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
grid::block_dimension_t block_size_in_threads,
Expand Down
6 changes: 2 additions & 4 deletions src/cuda/api/multi_wrapper_impls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,7 @@ inline void copy_attributes(const stream_t &dest, const stream_t &src)
// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy
// from code compiled with a host-side-only compiler! See cuda_runtime.h for details

#if CUDA_VERSION >= 10000
namespace detail_ {

template <typename UnaryFunction>
Expand All @@ -1670,9 +1671,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
grid::block_dimension_t block_size_limit,
bool disable_caching_override)
{
#if CUDA_VERSION <= 10000
throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#else
int min_grid_size_in_blocks { 0 };
int block_size { 0 };
// Note: only initializing the values here because of a
Expand All @@ -1688,7 +1686,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
"Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
" on device " + ::std::to_string(device_id) + ".");
return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
#endif // CUDA_VERSION <= 10000
}

inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
Expand Down Expand Up @@ -1726,6 +1723,7 @@ grid::complete_dimensions_t min_grid_params_for_max_occupancy(
return detail_::min_grid_params_for_max_occupancy(
kernel.ptr(), kernel.device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000

inline kernel::attributes_t apriori_compiled_kernel_t::attributes() const
{
Expand Down
2 changes: 2 additions & 0 deletions src/cuda/api/peer_to_peer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ namespace peer_to_peer {
constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices
constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise
constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise
#if CUDA_VERSION >= 10000
constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array interpolatory access operations are supported across the link, 0 otherwise
#endif


namespace detail_ {
Expand Down
6 changes: 0 additions & 6 deletions src/cuda/api/pointer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@ enum type_t : ::std::underlying_type<CUmemorytype>::type {
array = CU_MEMORYTYPE_ARRAY,
unified_ = CU_MEMORYTYPE_UNIFIED,
managed_ = CU_MEMORYTYPE_UNIFIED, // an alias (more like the runtime API name)
#if CUDA_VERSION >= 10000
// TODO: Why doesn't the driver API have this?
// unregistered_ = cudaMemoryTypeUnregistered,
#else
unregistered_
#endif // CUDA_VERSION >= 10000
};

#if CUDA_VERSION >= 11020
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/nvtx/profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <cuda_profiler_api.h>

#include <cuda/api/error.hpp>
#if CUDA_VERSION >= 1000 && defined(_WIN32)
#if CUDA_VERSION >= 10000 && defined(_WIN32)
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtCudaRt.h>
#else
Expand Down

0 comments on commit 6e3e5a7

Please sign in to comment.