Skip to content

Commit

Permalink
Regards #304: Re-establish CUDA 9.2 compatibility, mostly:
Browse files Browse the repository at this point in the history
* Corrected wrong version check in `nvtx.hpp` (needed 10000, was using 1000)
* Dropped the `unregistered_` memory type from the `memory::type_t` enum (actually, I've forgotten why we had it there in the first place)
* Defining some `kernel_t` and `context_t` methods conditionally, since they're not supported in CUDA 9.2; and I'd rather they not fail at runtime.
* In the "p2p bandwidth latency test" example program, made one mode of operation unavailable when the CUDA version is under 10.0, since that mode uses a CUDA 10 API call and was not implemented in the CUDA 9.x version of the example program
* In CUDA 9.2 NVRTC, you can't get the address of a `__constant__`, only of a kernel. So, we disable the tests involving `__constant__` symbols.

Caveats:

* Some tests still fail; it remains to determine why.
* These changes target 9.2 compatibility, not 9.0.
  • Loading branch information
eyalroz committed Jun 20, 2022
1 parent 9658fe5 commit 6e3e5a7
Show file tree
Hide file tree
Showing 10 changed files with 43 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -137,21 +137,25 @@ void enqueue_p2p_copy(
P2PEngine p2p_mechanism,
cuda::stream_t& stream)
{
auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);


#if CUDA_VERSION <= 10000
(void) p2paccess;
(void) p2p_mechanism;
#else
if (p2p_mechanism == SM && p2paccess)
{
auto copy_kernel = cuda::kernel::get(stream.device(), copyp2p);
auto grid_and_block_dims = copy_kernel.min_grid_params_for_max_occupancy();
// Note: We could have alternatively used:
// auto grid_and_block_dims = cuda::kernel::occupancy::min_grid_params_for_max_occupancy(copy_kernel);
auto launch_config = cuda::make_launch_config(grid_and_block_dims);

for (int r = 0; r < repeat; r++) {
stream.enqueue.kernel_launch(copy_kernel, launch_config, (int4*)dest, (int4*)src, num_elems/sizeof(int4));
}
}
else
{
#endif // CUDA_VERSION >= 10000
{
for (int r = 0; r < repeat; r++) {
// Since we assume Compute Capability >= 2.0, all devices support the
// Unified Virtual Address Space, so we don't need to use
Expand Down Expand Up @@ -520,7 +524,12 @@ command_line_options handle_command_line(int argc, char** argv)
test_p2p_read = P2P_READ;
}
if (checkCmdLineFlag(argc, (const char**) (argv), "sm_copy")) {
#if CUDA_VERSION <= 10000
std::cerr << "This mechanism is unsupported by this program before CUDA 10.0" << std::endl;
exit(EXIT_FAILURE);
#else
p2p_mechanism = SM;
#endif
}
return { test_p2p_read, p2p_mechanism };
}
Expand Down
6 changes: 6 additions & 0 deletions examples/other/jitify/jitify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,8 +326,10 @@ int main(int, char**)

bool test_simple_result = test_simple<float>();
bool test_kernels_result = test_kernels<float>();
#if CUDA_VERSION >= 10000
bool test_constant_result = test_constant();
bool test_constant_2_result = test_constant_2();
#endif // CUDA_VERSION >= 10000

// Note: There's no source-based or signature-based kernel caching mechanism - but
// you can certainly keep the modules and kernels built within the test_XXXX functions
Expand All @@ -340,14 +342,18 @@ int main(int, char**)

cout << "test_simple<float>: " << pass_or_fail(test_simple_result) << endl;
cout << "test_kernels<float>: " << pass_or_fail(test_kernels_result) << endl;
#if CUDA_VERSION >= 10000
cout << "test_constant: " << pass_or_fail(test_constant_result) << endl;
cout << "test_constant_2: " << pass_or_fail(test_constant_2_result) << endl;
#endif // CUDA_VERSION >= 10000

return not(
test_simple_result
and test_kernels_result
#if CUDA_VERSION >= 10000
and test_constant_result
and test_constant_2_result
#endif // CUDA_VERSION >= 10000
);
}

3 changes: 2 additions & 1 deletion src/cuda/api/apriori_compiled_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ class apriori_compiled_kernel_t final : public kernel_t {

void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const override;

#if CUDA_VERSION >= 10000
grid::complete_dimensions_t min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
grid::block_dimension_t block_size_limit = 0,
Expand All @@ -359,7 +360,7 @@ class apriori_compiled_kernel_t final : public kernel_t {
shared_memory_size_determiner,
block_size_limit, disable_caching_override);
}

#endif

kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const override;

Expand Down
2 changes: 2 additions & 0 deletions src/cuda/api/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ class context_t {
return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT);
}

#if CUDA_VERSION >= 10000
/**
* @return maximum granularity of fetching from the L2 cache
*
Expand All @@ -423,6 +424,7 @@ class context_t {
scoped_setter_type set_context_for_this_scope(handle_);
return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY);
}
#endif

/**
* @brief Returns the shared memory bank size, as described in
Expand Down
6 changes: 5 additions & 1 deletion src/cuda/api/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ enum named_t : ::std::underlying_type<status_t>::type {
insufficient_driver = cudaErrorInsufficientDriver,
set_on_active_process = cudaErrorSetOnActiveProcess,
invalid_surface = cudaErrorInvalidSurface,
no_device = cudaErrorNoDevice, // == 100
#if CUDA_VERSION < 10000
no_device = CUDA_ERROR_NO_DEVICE,
#else
no_device = cudaErrorNoDevice, // == 100
#endif // CUDA_VERSION < 10000
ecc_uncorrectable = cudaErrorECCUncorrectable,
shared_object_symbol_not_found = cudaErrorSharedObjectSymbolNotFound,
shared_object_init_failed = cudaErrorSharedObjectInitFailed,
Expand Down
10 changes: 6 additions & 4 deletions src/cuda/api/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ class kernel_t {
return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
}

#if CUDA_VERSION >= 10000
/**
* @brief obtain the dimensions of a minimum grid which is expected to
* achieve maximum occupancy on the GPU the kernel is associated with.
Expand Down Expand Up @@ -206,6 +207,7 @@ class kernel_t {
grid::block_dimension_t block_size_limit = 0,
bool disable_caching_override = false) const;
///@}
#endif // CUDA_VERSION >= 10000

/**
* @brief Calculates the number of grid blocks which may be "active" on a given GPU
Expand Down Expand Up @@ -351,6 +353,7 @@ inline grid::dimension_t max_active_blocks_per_multiprocessor(
return result;
}

#if CUDA_VERSION >= 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
// if block_size_limit is 0, it is ignored.
inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
Expand All @@ -361,9 +364,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
cuda::grid::block_dimension_t block_size_limit,
bool disable_caching_override)
{
#if CUDA_VERSION <= 10000
throw cuda::runtime_error {cuda::status::not_yet_implemented};
#else
int min_grid_size_in_blocks { 0 };
int block_size { 0 };
// Note: only initializing the values here because of a
Expand All @@ -382,8 +382,8 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
"Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
+ " with maximum occupancy given dynamic shared memory and block size data");
return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
#endif // CUDA_VERSION <= 10000
}
#endif // CUDA_VERSION >= 10000

} // namespace detail_

Expand Down Expand Up @@ -436,6 +436,7 @@ inline ::std::string identify(const kernel_t& kernel)
} // namespace detail_
} // namespace kernel

#if CUDA_VERSION >= 10000
inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
memory::shared::size_t dynamic_shared_memory_size,
grid::block_dimension_t block_size_limit,
Expand All @@ -457,6 +458,7 @@ inline grid::complete_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
handle(), device_id(), shared_memory_size_determiner,
no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000

inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
grid::block_dimension_t block_size_in_threads,
Expand Down
6 changes: 2 additions & 4 deletions src/cuda/api/multi_wrapper_impls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1660,6 +1660,7 @@ inline void copy_attributes(const stream_t &dest, const stream_t &src)
// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy
// from code compiled with a host-side-only compiler! See cuda_runtime.h for details

#if CUDA_VERSION >= 10000
namespace detail_ {

template <typename UnaryFunction>
Expand All @@ -1670,9 +1671,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
grid::block_dimension_t block_size_limit,
bool disable_caching_override)
{
#if CUDA_VERSION <= 10000
throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#else
int min_grid_size_in_blocks { 0 };
int block_size { 0 };
// Note: only initializing the values here because of a
Expand All @@ -1688,7 +1686,6 @@ inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
"Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
" on device " + ::std::to_string(device_id) + ".");
return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
#endif // CUDA_VERSION <= 10000
}

inline grid::complete_dimensions_t min_grid_params_for_max_occupancy(
Expand Down Expand Up @@ -1726,6 +1723,7 @@ grid::complete_dimensions_t min_grid_params_for_max_occupancy(
return detail_::min_grid_params_for_max_occupancy(
kernel.ptr(), kernel.device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000

inline kernel::attributes_t apriori_compiled_kernel_t::attributes() const
{
Expand Down
2 changes: 2 additions & 0 deletions src/cuda/api/peer_to_peer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@ namespace peer_to_peer {
constexpr const attribute_t link_performance_rank = CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK; /// A relative value indicating the performance of the link between two devices
constexpr const attribute_t access_support = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; /// 1 if access is supported, 0 otherwise
constexpr const attribute_t native_atomics_support = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; /// 1 if the first device can perform native atomic operations on the second device, 0 otherwise
#if CUDA_VERSION >= 10000
constexpr const attribute_t array_access_support = CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED; /// 1 if special array interpolatory access operations are supported across the link, 0 otherwise
#endif


namespace detail_ {
Expand Down
6 changes: 0 additions & 6 deletions src/cuda/api/pointer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,6 @@ enum type_t : ::std::underlying_type<CUmemorytype>::type {
array = CU_MEMORYTYPE_ARRAY,
unified_ = CU_MEMORYTYPE_UNIFIED,
managed_ = CU_MEMORYTYPE_UNIFIED, // an alias (more like the runtime API name)
#if CUDA_VERSION >= 10000
// TODO: Why doesn't the driver API have this?
// unregistered_ = cudaMemoryTypeUnregistered,
#else
unregistered_
#endif // CUDA_VERSION >= 10000
};

#if CUDA_VERSION >= 11020
Expand Down
2 changes: 1 addition & 1 deletion src/cuda/nvtx/profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include <cuda_profiler_api.h>

#include <cuda/api/error.hpp>
#if CUDA_VERSION >= 1000 && defined(_WIN32)
#if CUDA_VERSION >= 10000 && defined(_WIN32)
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvToolsExtCudaRt.h>
#else
Expand Down

0 comments on commit 6e3e5a7

Please sign in to comment.