[STF] Option to disable kernel generation in CUDASTF (NVIDIA#2723)
* Add an option to control whether CUDA kernels can be generated with parallel_for and launch
caugonnet authored Nov 8, 2024
1 parent 8ed4a20 commit ea48e1b
Showing 14 changed files with 120 additions and 163 deletions.
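
With this change, kernel generation can be switched off at configure time. A minimal sketch of the configure step, assuming a CMake preset workflow like the one in CMakePresets.json below (the preset name is illustrative):

    cmake --preset cudax-dev -Dcudax_ENABLE_CUDASTF_CODE_GENERATION=OFF

When the option is OFF, STF targets are compiled without --extended-lambda and without the CUDASTF_CODE_GENERATION define, so the parallel_for and launch constructs are compiled out of the headers.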
2 changes: 2 additions & 0 deletions CMakePresets.json
@@ -89,6 +89,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_DEBUG": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": true,
@@ -329,6 +330,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": false,
"cudax_ENABLE_DIALECT_CPP20": false
1 change: 1 addition & 0 deletions cudax/CMakeLists.txt
@@ -21,6 +21,7 @@ option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON)
option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON)
option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)
10 changes: 9 additions & 1 deletion cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -11,10 +11,18 @@ function(cudax_stf_configure_target target_name)
CUDA::curand
CUDA::cuda_driver
)

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
)
target_compile_definitions(${target_name} PRIVATE "CUDASTF_CODE_GENERATION")
endif()

target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)

set_target_properties(${target_name} PROPERTIES
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
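
When the option is ON, the block above adds --extended-lambda (needed so nvcc accepts the __device__ lambdas that parallel_for and launch turn into kernels) and defines CUDASTF_CODE_GENERATION, which the header guards below key off. A minimal sketch of the resulting guard pattern in user code, assuming an already-created context ctx and logical data lX (both names illustrative):

    #if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
      // parallel_for generates a CUDA kernel from the extended device lambda
      ctx.parallel_for(lX.shape(), lX.rw())->*[] __device__(size_t i, auto x) {
        x(i) *= 2.0;
      };
    #else
      // Without code generation, fall back to a hand-written kernel
      // submitted through ctx.task(...).
    #endif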
22 changes: 16 additions & 6 deletions cudax/examples/stf/CMakeLists.txt
@@ -2,24 +2,30 @@ set(stf_example_sources
01-axpy.cu
01-axpy-cuda_kernel.cu
01-axpy-cuda_kernel_chain.cu
01-axpy-parallel_for.cu
01-axpy-launch.cu
02-axpy-host_launch.cu
03-temporary-data.cu
04-fibonacci.cu
04-fibonacci-run_once.cu
08-cub-reduce.cu
axpy-annotated.cu
void_data_interface.cu
explicit_data_places.cu
thrust_zip_iterator.cu
1f1b.cu
)

# Examples which rely on code generation (parallel_for or launch)
set(stf_example_codegen_sources
01-axpy-parallel_for.cu
01-axpy-launch.cu
binary_fhe.cu
cfd.cu
custom_data_interface.cu
void_data_interface.cu
frozen_data_init.cu
graph_algorithms/degree_centrality.cu
graph_algorithms/pagerank.cu
graph_algorithms/tricount.cu
graph_algorithms/jaccard.cu
explicit_data_places.cu
fdtd_mgpu.cu
heat.cu
heat_mgpu.cu
@@ -33,9 +39,7 @@
scan.cu
mandelbrot.cu
standalone-launches.cu
thrust_zip_iterator.cu
word_count.cu
1f1b.cu
)

# Examples using CUBLAS, CUSOLVER...
@@ -110,6 +114,12 @@ foreach(cn_target IN LISTS cudax_TARGETS)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
foreach(source IN LISTS stf_example_codegen_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()
endif()

if (cudax_ENABLE_CUDASTF_MATHLIBS)
foreach(source IN LISTS stf_example_mathlib_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target} LINK_MATHLIBS)
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -763,7 +763,7 @@ UNITTEST("set_symbol on graph_task and graph_task<>")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{

@@ -1028,7 +1028,7 @@

} // end namespace reserved

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // UNITTESTED_FILE
} // end namespace cuda::experimental::stf
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -31,7 +31,7 @@ namespace cuda::experimental::stf
{

// This feature requires a CUDA compiler
#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
template <typename...>
@@ -505,5 +505,5 @@

} // namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
} // end namespace cuda::experimental::stf
@@ -29,7 +29,7 @@
namespace cuda::experimental::stf
{

#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
class graph_ctx;
@@ -468,6 +468,6 @@ private:
};
} // end namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

} // end namespace cuda::experimental::stf
@@ -419,7 +419,7 @@ private:
};

#ifdef UNITTESTED_FILE
# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
template <auto... spec>
@@ -525,7 +525,7 @@ UNITTEST("thread hierarchy inner sync")
cuda_safe_call(cudaDeviceSynchronize());
};

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
#endif // UNITTESTED_FILE

} // end namespace cuda::experimental::stf
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
@@ -1125,7 +1125,7 @@ UNITTEST("get logical_data from a task_dep")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_pfor()
@@ -1313,8 +1313,8 @@ UNITTEST("basic launch test")
};

} // end namespace reserved
# endif // UNITTESTED_FILE
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // __CUDACC__
#endif // UNITTESTED_FILE

} // namespace cuda::experimental::stf
28 changes: 6 additions & 22 deletions cudax/include/cuda/experimental/stf.cuh
@@ -357,18 +357,13 @@ public:
return task(default_exec_place(), mv(deps)...);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
/*
* parallel_for : apply an operation over a shaped index space
*/
template <typename S, typename... Deps>
auto parallel_for(exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, null_partition, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, null_partition, Deps...>>;
@@ -382,12 +377,6 @@ public:
template <typename partitioner_t, typename S, typename... Deps>
auto parallel_for(partitioner_t p, exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, partitioner_t, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, partitioner_t, Deps...>>;
@@ -403,6 +392,7 @@ public:
{
return parallel_for(default_exec_place(), mv(shape), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

template <typename... Deps>
auto host_launch(task_dep<Deps>... deps)
@@ -473,17 +463,10 @@ public:
payload);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
template <typename thread_hierarchy_spec_t, typename... Deps>
auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<thread_hierarchy_spec_t, ::std::false_type>,
"launch is only supported with CUDA compilers.");
#endif

using result_t = unified_scope<reserved::launch_scope<stream_ctx, thread_hierarchy_spec_t, Deps...>,
reserved::launch_scope<graph_ctx, thread_hierarchy_spec_t, Deps...>>;
return ::std::visit(
@@ -514,6 +497,7 @@ public:
{
return launch(mv(ths), default_exec_place(), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

auto repeat(size_t count)
{
@@ -802,7 +786,7 @@ UNITTEST("context with arguments")
cuda_safe_call(cudaStreamDestroy(stream));
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_context_pfor()
@@ -1251,7 +1235,7 @@ UNITTEST("unit_test_partitioner_product")
};

} // namespace reserved
# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

UNITTEST("make_tuple_indexwise")
{
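
Note the shape of the stf.cuh change: the old instantiation-time static_assert ("parallel_for is only supported with CUDA compilers.") is removed, and the parallel_for and launch members now exist only when both CUDASTF_CODE_GENERATION and __CUDACC__ are defined, so a misconfigured build fails at name lookup rather than at template instantiation. For reference, a minimal sketch of the gated API, assuming a build with cudax_ENABLE_CUDASTF_CODE_GENERATION=ON compiled by nvcc (buffer size and values are illustrative):

    #include <cuda/experimental/stf.cuh>

    using namespace cuda::experimental::stf;

    int main() {
      context ctx;
      double X[16];
      auto lX = ctx.logical_data(X); // register the host buffer with the runtime

      // Only compiles when CUDASTF_CODE_GENERATION is defined: generates a
      // CUDA kernel from the device lambda over the 1D shape of lX.
      ctx.parallel_for(lX.shape(), lX.write())->*[] __device__(size_t i, auto x) {
        x(i) = 2.0 * i;
      };

      ctx.finalize(); // wait for completion and write results back to X
      return 0;
    }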