[STF] Option to disable kernel generation in CUDASTF #2723

Merged · 12 commits · Nov 8, 2024
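Summary of the change, as reflected in the diff below: a new CMake option, cudax_ENABLE_CUDASTF_CODE_GENERATION (ON by default), lets users opt out of CUDASTF's kernel-generating constructs, parallel_for and launch. When the option is ON, STF targets are compiled with a CUDASTF_CODE_GENERATION definition and the --extended-lambda flag, and the headers tighten their existing compiler guards from __CUDACC__ alone to the pattern shown in this sketch:

```cpp
// Before: code-generation paths were gated on the compiler alone.
#ifdef __CUDACC__
// ... parallel_for / launch machinery ...
#endif // __CUDACC__

// After: they additionally require the new opt-out definition.
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
// ... parallel_for / launch machinery ...
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
```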
2 changes: 2 additions & 0 deletions CMakePresets.json
@@ -89,6 +89,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_DEBUG": false,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": true,
@@ -329,6 +330,7 @@
"cudax_ENABLE_EXAMPLES": true,
"cudax_ENABLE_CUDASTF": true,
"cudax_ENABLE_CUDASTF_BOUNDSCHECK": false,
"cudax_ENABLE_CUDASTF_CODE_GENERATION": true,
"cudax_ENABLE_CUDASTF_MATHLIBS": false,
"cudax_ENABLE_DIALECT_CPP17": false,
"cudax_ENABLE_DIALECT_CPP20": false
1 change: 1 addition & 0 deletions cudax/CMakeLists.txt
@@ -21,6 +21,7 @@ option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers
option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON)
option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON)
option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON)
option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON)
option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF)
option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF)
10 changes: 9 additions & 1 deletion cudax/cmake/cudaxSTFConfigureTarget.cmake
@@ -11,10 +11,18 @@ function(cudax_stf_configure_target target_name)
CUDA::curand
CUDA::cuda_driver
)

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
)
target_compile_definitions(${target_name} PRIVATE "CUDASTF_CODE_GENERATION")
endif()

target_compile_options(${target_name} PRIVATE
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>
$<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--expt-relaxed-constexpr>
)

set_target_properties(${target_name} PROPERTIES
CUDA_RUNTIME_LIBRARY Static
CUDA_SEPARABLE_COMPILATION ON
22 changes: 16 additions & 6 deletions cudax/examples/stf/CMakeLists.txt
@@ -2,24 +2,30 @@ set(stf_example_sources
01-axpy.cu
01-axpy-cuda_kernel.cu
01-axpy-cuda_kernel_chain.cu
01-axpy-parallel_for.cu
01-axpy-launch.cu
02-axpy-host_launch.cu
03-temporary-data.cu
04-fibonacci.cu
04-fibonacci-run_once.cu
08-cub-reduce.cu
axpy-annotated.cu
void_data_interface.cu
explicit_data_places.cu
thrust_zip_iterator.cu
1f1b.cu
)

# Examples which rely on code generation (parallel_for or launch)
set(stf_example_codegen_sources
01-axpy-parallel_for.cu
01-axpy-launch.cu
binary_fhe.cu
cfd.cu
custom_data_interface.cu
void_data_interface.cu
frozen_data_init.cu
graph_algorithms/degree_centrality.cu
graph_algorithms/pagerank.cu
graph_algorithms/tricount.cu
graph_algorithms/jaccard.cu
explicit_data_places.cu
fdtd_mgpu.cu
heat.cu
heat_mgpu.cu
@@ -33,9 +39,7 @@ set(stf_example_sources
scan.cu
mandelbrot.cu
standalone-launches.cu
thrust_zip_iterator.cu
word_count.cu
1f1b.cu
)

# Examples using CUBLAS, CUSOLVER...
@@ -110,6 +114,12 @@ foreach(cn_target IN LISTS cudax_TARGETS)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()

if (cudax_ENABLE_CUDASTF_CODE_GENERATION)
foreach(source IN LISTS stf_example_codegen_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target})
endforeach()
endif()

if (cudax_ENABLE_CUDASTF_MATHLIBS)
foreach(source IN LISTS stf_example_mathlib_sources)
cudax_add_stf_example(example_target "${source}" ${cn_target} LINK_MATHLIBS)
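For reference, the examples moved into stf_example_codegen_sources above all rely on one of the two code-generating constructs. A minimal sketch in the spirit of 01-axpy-parallel_for.cu (not its verbatim contents) shows why these examples need an extended-lambda-capable CUDA compiler:

```cpp
#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

int main() {
  context ctx;

  double X[128], Y[128];
  for (size_t i = 0; i < 128; i++) {
    X[i] = 1.0 * i;
    Y[i] = 2.0 * i;
  }

  // Logical data handles tracked by the STF runtime.
  auto lX = ctx.logical_data(X);
  auto lY = ctx.logical_data(Y);

  double alpha = 3.14;

  // parallel_for turns the extended __device__ lambda into a CUDA kernel,
  // which is exactly the code-generation path gated by the new option.
  ctx.parallel_for(lY.shape(), lX.read(), lY.rw())
      ->*[alpha] __device__(size_t i, auto dX, auto dY) {
            dY(i) += alpha * dX(i);
          };

  ctx.finalize();
}
```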
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh
@@ -763,7 +763,7 @@ UNITTEST("set_symbol on graph_task and graph_task<>")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{

@@ -1028,7 +1028,7 @@

} // end namespace reserved

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // UNITTESTED_FILE
} // end namespace cuda::experimental::stf
4 changes: 2 additions & 2 deletions cudax/include/cuda/experimental/__stf/internal/launch.cuh
@@ -31,7 +31,7 @@ namespace cuda::experimental::stf
{

// This feature requires a CUDA compiler
#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
template <typename...>
@@ -505,5 +505,5 @@

} // namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
} // end namespace cuda::experimental::stf
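launch is the second construct placed behind the new guard. As a hedged sketch reusing lX, lY, and alpha from the axpy sketch above (the par() spec and th.apply_partition() helpers follow the CUDASTF documentation; details may differ across versions):

```cpp
// A launch-based equivalent of the axpy above: one explicit thread
// hierarchy of 1024 parallel threads, each covering a slice of the shape.
ctx.launch(par(1024), exec_place::current_device(), lX.read(), lY.rw())
    ->*[alpha] __device__(auto th, auto dX, auto dY) {
          for (auto i : th.apply_partition(shape(dX))) {
            dY(i) += alpha * dX(i);
          }
        };
```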
@@ -29,7 +29,7 @@
namespace cuda::experimental::stf
{

#ifdef __CUDACC__
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

class stream_ctx;
class graph_ctx;
@@ -468,6 +468,6 @@ private:
};
} // end namespace reserved

#endif // __CUDACC__
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

} // end namespace cuda::experimental::stf
@@ -419,7 +419,7 @@ private:
};

#ifdef UNITTESTED_FILE
# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
template <auto... spec>
@@ -525,7 +525,7 @@ UNITTEST("thread hierarchy inner sync")
cuda_safe_call(cudaDeviceSynchronize());
};

# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
#endif // UNITTESTED_FILE

} // end namespace cuda::experimental::stf
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh
@@ -1125,7 +1125,7 @@ UNITTEST("get logical_data from a task_dep")
ctx.finalize();
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_pfor()
@@ -1313,8 +1313,8 @@ UNITTEST("basic launch test")
};

} // end namespace reserved
# endif // UNITTESTED_FILE
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

#endif // __CUDACC__
#endif // UNITTESTED_FILE

} // namespace cuda::experimental::stf
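Note that the last hunk also corrects the previously swapped #endif comments in stream_ctx.cuh. The resulting nesting, pieced together from the added lines above, is:

```cpp
#ifdef UNITTESTED_FILE
#  if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
// ... unit tests that rely on parallel_for / launch ...
#  endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
#endif // UNITTESTED_FILE
```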
28 changes: 6 additions & 22 deletions cudax/include/cuda/experimental/stf.cuh
@@ -357,18 +357,13 @@ public:
return task(default_exec_place(), mv(deps)...);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
/*
* parallel_for : apply an operation over a shaped index space
*/
template <typename S, typename... Deps>
auto parallel_for(exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, null_partition, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, null_partition, Deps...>>;
@@ -382,12 +377,6 @@ public:
template <typename partitioner_t, typename S, typename... Deps>
auto parallel_for(partitioner_t p, exec_place e_place, S shape, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<S, ::std::false_type>, "parallel_for is only supported with CUDA compilers.");
#endif
EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized.");
using result_t = unified_scope<reserved::parallel_for_scope<stream_ctx, S, partitioner_t, Deps...>,
reserved::parallel_for_scope<graph_ctx, S, partitioner_t, Deps...>>;
@@ -403,6 +392,7 @@ public:
{
return parallel_for(default_exec_place(), mv(shape), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

template <typename... Deps>
auto host_launch(task_dep<Deps>... deps)
@@ -473,17 +463,10 @@ public:
payload);
}

#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
template <typename thread_hierarchy_spec_t, typename... Deps>
auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep<Deps>... deps)
{
#ifndef __CUDACC__
// We want a test that always fail, but is only triggered when the
// function is instantiated so the test relies on actual templated
// types.
static_assert(::std::is_same_v<thread_hierarchy_spec_t, ::std::false_type>,
"launch is only supported with CUDA compilers.");
#endif

using result_t = unified_scope<reserved::launch_scope<stream_ctx, thread_hierarchy_spec_t, Deps...>,
reserved::launch_scope<graph_ctx, thread_hierarchy_spec_t, Deps...>>;
return ::std::visit(
@@ -514,6 +497,7 @@ public:
{
return launch(mv(ths), default_exec_place(), mv(deps)...);
}
#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

auto repeat(size_t count)
{
@@ -802,7 +786,7 @@ UNITTEST("context with arguments")
cuda_safe_call(cudaStreamDestroy(stream));
};

# ifdef __CUDACC__
# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
namespace reserved
{
inline void unit_test_context_pfor()
@@ -1251,7 +1235,7 @@ UNITTEST("unit_test_partitioner_product")
};

} // namespace reserved
# endif // __CUDACC__
# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)

UNITTEST("make_tuple_indexwise")
{
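One user-visible consequence of the stf.cuh hunks above: the old per-function static_assert ("parallel_for is only supported with CUDA compilers.") fired at instantiation time, while the new guard removes the declarations entirely, so misuse now fails at name lookup. User code that must build both with and without code generation can guard itself on the same macro pair. A hedged sketch, again reusing the names from the axpy example (axpy_kernel is illustrative, not part of the library):

```cpp
// Hand-written fallback kernel for builds without code generation.
template <typename SliceX, typename SliceY>
__global__ void axpy_kernel(double a, SliceX x, SliceY y) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < x.size();
       i += blockDim.x * gridDim.x) {
    y(i) += a * x(i);
  }
}

// ... inside the code that submits work ...
#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__)
  // Generated-kernel path via parallel_for.
  ctx.parallel_for(lY.shape(), lX.read(), lY.rw())
      ->*[alpha] __device__(size_t i, auto dX, auto dY) {
            dY(i) += alpha * dX(i);
          };
#else
  // Fallback: launch the explicit kernel from a plain task, as in
  // 01-axpy-cuda_kernel.cu.
  ctx.task(lX.read(), lY.rw())->*[&](cudaStream_t s, auto dX, auto dY) {
    axpy_kernel<<<128, 128, 0, s>>>(alpha, dX, dY);
  };
#endif
```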