[STF] Option to disable kernel generation in CUDASTF (NVIDIA#2723)
* Add an option to control whether CUDA kernels can be generated with parallel_for and launch
caugonnet authored Nov 8, 2024
1 parent 8ed4a20 commit ea48e1b
Showing 14 changed files with 120 additions and 163 deletions.
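
With this change, kernel generation can be switched off at configure time. A minimal sketch of the configure step, assuming a CMake preset workflow like the one in CMakePresets.json below (the preset name is illustrative):

    cmake --preset cudax-dev -Dcudax_ENABLE_CUDASTF_CODE_GENERATION=OFF

When the option is OFF, STF targets are compiled without --extended-lambda and without the CUDASTF_CODE_GENERATION define, so the parallel_for and launch constructs are compiled out of the headers.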
2 changes: 2 additions & 0 deletions CMakePresets.json
@@ -89,6 +89,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_DEBUG": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": true,
@@ -329,6 +330,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": false,
"cudax_ENABLE_DIALECT_CPP20": false
1 change: 1 addition & 0 deletions cudax/CMakeLists.txt
@@ -21,6 +21,7 @@ option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON)
option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON)
option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)
10 changes: 9 additions & 1 deletion cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -11,10 +11,18 @@ function(cudax_stf_configure_target target_name)
CUDA::curand
CUDA::cuda_driver
)

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
)
target_compile_definitions(${target_name} PRIVATE "CUDASTF_CODE_GENERATION")
endif()

target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)

set_target_properties(${target_name} PROPERTIES
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
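
When the option is ON, the block above adds --extended-lambda (needed so nvcc accepts the __device__ lambdas that parallel_for and launch turn into kernels) and defines CUDASTF_CODE_GENERATION, which the header guards below key off. A minimal sketch of the resulting guard pattern in user code, assuming an already-created context ctx and logical data lX (both names illustrative):

    #if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
      // parallel_for generates a CUDA kernel from the extended device lambda
      ctx.parallel_for(lX.shape(), lX.rw())->*[] __device__(size_t i, auto x) {
        x(i) *= 2.0;
      };
    #else
      // Without code generation, fall back to a hand-written kernel
      // submitted through ctx.task(...).
    #endif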
22 changes: 16 additions & 6 deletions cudax/examples/stf/CMakeLists.txt
@@ -2,24 +2,30 @@ set(stf_example_sources
01-axpy.cu
01-axpy-cuda_kernel.cu
01-axpy-cuda_kernel_chain.cu
01-axpy-parallel_for.cu
01-axpy-launch.cu
02-axpy-host_launch.cu
03-temporary-data.cu
04-fibonacci.cu
04-fibonacci-run_once.cu
08-cub-reduce.cu
axpy-annotated.cu
void_data_interface.cu
explicit_data_places.cu
thrust_zip_iterator.cu
1f1b.cu
)

# Examples which rely on code generation (parallel_for or launch)
set(stf_example_codegen_sources
01-axpy-parallel_for.cu
01-axpy-launch.cu
binary_fhe.cu
cfd.cu
custom_data_interface.cu
void_data_interface.cu
frozen_data_init.cu
graph_algorithms/degree_centrality.cu
graph_algorithms/pagerank.cu
graph_algorithms/tricount.cu
graph_algorithms/jaccard.cu
explicit_data_places.cu
fdtd_mgpu.cu
heat.cu
heat_mgpu.cu
@@ -33,9 +39,7 @@
scan.cu
mandelbrot.cu
standalone-launches.cu
thrust_zip_iterator.cu
word_count.cu
1f1b.cu
)

# Examples using CUBLAS, CUSOLVER...
@@ -110,6 +114,12 @@ foreach(cn_target IN LISTS cudax_TARGETS)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
foreach(source IN LISTS stf_example_codegen_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()
endif()

if (cudax_ENABLE_CUDASTF_MATHLIBS)
foreach(source IN LISTS stf_example_mathlib_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target} LINK_MATHLIBS)
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -763,7 +763,7 @@ UNITTEST("set_symbol on graph_task and graph_task<>")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{

@@ -1028,7 +1028,7 @@

} // end namespace reserved

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // UNITTESTED_FILE
} // end namespace cuda::experimental::stf
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -31,7 +31,7 @@ namespace cuda::experimental::stf
{

// This feature requires a CUDA compiler
#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
template <typename...>
@@ -505,5 +505,5 @@

} // namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
} // end namespace cuda::experimental::stf
@@ -29,7 +29,7 @@
namespace cuda::experimental::stf
{

#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
class graph_ctx;
@@ -468,6 +468,6 @@ private:
};
} // end namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

} // end namespace cuda::experimental::stf
@@ -419,7 +419,7 @@ private:
};

#ifdef UNITTESTED_FILE
# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
template <auto... spec>
@@ -525,7 +525,7 @@ UNITTEST("thread hierarchy inner sync")
cuda_safe_call(cudaDeviceSynchronize());
};

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
#endif // UNITTESTED_FILE

} // end namespace cuda::experimental::stf
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
@@ -1125,7 +1125,7 @@ UNITTEST("get logical_data from a task_dep")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_pfor()
@@ -1313,8 +1313,8 @@ UNITTEST("basic launch test")
};

} // end namespace reserved
# endif // UNITTESTED_FILE
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // __CUDACC__
#endif // UNITTESTED_FILE

} // namespace cuda::experimental::stf
28 changes: 6 additions & 22 deletions cudax/include/cuda/experimental/stf.cuh
@@ -357,18 +357,13 @@ public:
return task(default_exec_place(), mv(deps)...);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
/*
* parallel_for : apply an operation over a shaped index space
*/
template <typename S, typename... Deps>
auto parallel_for(exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, null_partition, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, null_partition, Deps...>>;
@@ -382,12 +377,6 @@ public:
template <typename partitioner_t, typename S, typename... Deps>
auto parallel_for(partitioner_t p, exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, partitioner_t, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, partitioner_t, Deps...>>;
@@ -403,6 +392,7 @@ public:
{
return parallel_for(default_exec_place(), mv(shape), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

template <typename... Deps>
auto host_launch(task_dep<Deps>... deps)
@@ -473,17 +463,10 @@ public:
payload);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
template <typename thread_hierarchy_spec_t, typename... Deps>
auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<thread_hierarchy_spec_t, ::std::false_type>,
"launch is only supported with CUDA compilers.");
#endif

using result_t = unified_scope<reserved::launch_scope<stream_ctx, thread_hierarchy_spec_t, Deps...>,
reserved::launch_scope<graph_ctx, thread_hierarchy_spec_t, Deps...>>;
return ::std::visit(
@@ -514,6 +497,7 @@ public:
{
return launch(mv(ths), default_exec_place(), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

auto repeat(size_t count)
{
@@ -802,7 +786,7 @@ UNITTEST("context with arguments")
cuda_safe_call(cudaStreamDestroy(stream));
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_context_pfor()
@@ -1251,7 +1235,7 @@ UNITTEST("unit_test_partitioner_product")
};

} // namespace reserved
# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

UNITTEST("make_tuple_indexwise")
{
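
Note the shape of the stf.cuh change: the old instantiation-time static_assert ("parallel_for is only supported with CUDA compilers.") is removed, and the parallel_for and launch members now exist only when both CUDASTF_CODE_GENERATION and __CUDACC__ are defined, so a misconfigured build fails at name lookup rather than at template instantiation. For reference, a minimal sketch of the gated API, assuming a build with cudax_ENABLE_CUDASTF_CODE_GENERATION=ON compiled by nvcc (buffer size and values are illustrative):

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    int main() {
      context ctx;
      double X[16];
      auto lX = ctx.logical_data(X); // register the host buffer with the runtime

      // Only compiles when CUDASTF_CODE_GENERATION is defined: generates a
      // CUDA kernel from the device lambda over the 1D shape of lX.
      ctx.parallel_for(lX.shape(), lX.write())->*[] __device__(size_t i, auto x) {
        x(i) = 2.0 * i;
      };

      ctx.finalize(); // wait for completion and write results back to X
      return 0;
    }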