diff --git a/CMakePresets.json b/CMakePresets.json index b7d119840fe..1b7409a8278 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -89,6 +89,7 @@ "cudax_ENABLE_EXAMPLES": true, "cudax_ENABLE_CUDASTF": true, "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, + "cudax_ENABLE_CUDASTF_CODE_GENERATION": true, "cudax_ENABLE_CUDASTF_DEBUG": false, "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": true, @@ -329,6 +330,7 @@ "cudax_ENABLE_EXAMPLES": true, "cudax_ENABLE_CUDASTF": true, "cudax_ENABLE_CUDASTF_BOUNDSCHECK": false, + "cudax_ENABLE_CUDASTF_CODE_GENERATION": true, "cudax_ENABLE_CUDASTF_MATHLIBS": false, "cudax_ENABLE_DIALECT_CPP17": false, "cudax_ENABLE_DIALECT_CPP20": false diff --git a/cudax/CMakeLists.txt b/cudax/CMakeLists.txt index cb928391b8b..a9725331592 100644 --- a/cudax/CMakeLists.txt +++ b/cudax/CMakeLists.txt @@ -21,6 +21,7 @@ option(cudax_ENABLE_HEADER_TESTING "Test that CUDA Experimental's public headers option(cudax_ENABLE_TESTING "Build CUDA Experimental's tests." ON) option(cudax_ENABLE_EXAMPLES "Build CUDA Experimental's examples." ON) option(cudax_ENABLE_CUDASTF "Enable CUDASTF subproject" ON) +option(cudax_ENABLE_CUDASTF_CODE_GENERATION "Enable code generation using STF's parallel_for or launch with CUDA compiler." ON) option(cudax_ENABLE_CUDASTF_BOUNDSCHECK "Enable bounds checks for STF targets. Requires debug build." OFF) option(cudax_ENABLE_CUDASTF_DEBUG "Enable additional debugging for STF targets. Requires debug build." OFF) option(cudax_ENABLE_CUDASTF_MATHLIBS "Enable STF tests/examples that use cublas/cusolver." OFF) diff --git a/cudax/cmake/cudaxSTFConfigureTarget.cmake b/cudax/cmake/cudaxSTFConfigureTarget.cmake index 74765af8ff3..1d504bab969 100644 --- a/cudax/cmake/cudaxSTFConfigureTarget.cmake +++ b/cudax/cmake/cudaxSTFConfigureTarget.cmake @@ -11,10 +11,18 @@ function(cudax_stf_configure_target target_name) CUDA::curand CUDA::cuda_driver ) + + if (cudax_ENABLE_CUDASTF_CODE_GENERATION) + target_compile_options(${target_name} PRIVATE + $<$:--extended-lambda> + ) + target_compile_definitions(${target_name} PRIVATE "CUDASTF_CODE_GENERATION") + endif() + target_compile_options(${target_name} PRIVATE - $<$:--extended-lambda> $<$:--expt-relaxed-constexpr> ) + set_target_properties(${target_name} PROPERTIES CUDA_RUNTIME_LIBRARY Static CUDA_SEPARABLE_COMPILATION ON diff --git a/cudax/examples/stf/CMakeLists.txt b/cudax/examples/stf/CMakeLists.txt index 586a3fe4be4..d8953beae51 100644 --- a/cudax/examples/stf/CMakeLists.txt +++ b/cudax/examples/stf/CMakeLists.txt @@ -2,24 +2,30 @@ set(stf_example_sources 01-axpy.cu 01-axpy-cuda_kernel.cu 01-axpy-cuda_kernel_chain.cu - 01-axpy-parallel_for.cu - 01-axpy-launch.cu 02-axpy-host_launch.cu 03-temporary-data.cu 04-fibonacci.cu 04-fibonacci-run_once.cu 08-cub-reduce.cu axpy-annotated.cu + void_data_interface.cu + explicit_data_places.cu + thrust_zip_iterator.cu + 1f1b.cu +) + +# Examples which rely on code generation (parallel_for or launch) +set(stf_example_codegen_sources + 01-axpy-parallel_for.cu + 01-axpy-launch.cu binary_fhe.cu cfd.cu custom_data_interface.cu - void_data_interface.cu frozen_data_init.cu graph_algorithms/degree_centrality.cu graph_algorithms/pagerank.cu graph_algorithms/tricount.cu graph_algorithms/jaccard.cu - explicit_data_places.cu fdtd_mgpu.cu heat.cu heat_mgpu.cu @@ -33,9 +39,7 @@ set(stf_example_sources scan.cu mandelbrot.cu standalone-launches.cu - thrust_zip_iterator.cu word_count.cu - 1f1b.cu ) # Examples using CUBLAS, CUSOLVER... @@ -110,6 +114,12 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_stf_example(example_target "${source}" ${cn_target}) endforeach() + if (cudax_ENABLE_CUDASTF_CODE_GENERATION) + foreach(source IN LISTS stf_example_codegen_sources) + cudax_add_stf_example(example_target "${source}" ${cn_target}) + endforeach() + endif() + if (cudax_ENABLE_CUDASTF_MATHLIBS) foreach(source IN LISTS stf_example_mathlib_sources) cudax_add_stf_example(example_target "${source}" ${cn_target} LINK_MATHLIBS) diff --git a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh index fe6b8a59f11..5e78fa336b7 100644 --- a/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/graph/graph_ctx.cuh @@ -763,7 +763,7 @@ UNITTEST("set_symbol on graph_task and graph_task<>") ctx.finalize(); }; -# ifdef __CUDACC__ +# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) namespace reserved { @@ -1028,7 +1028,7 @@ UNITTEST("create many graph ctxs") } // end namespace reserved -# endif // __CUDACC__ +# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) #endif // UNITTESTED_FILE } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/launch.cuh b/cudax/include/cuda/experimental/__stf/internal/launch.cuh index 9d9abad3bf5..7d96933f39a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/launch.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/launch.cuh @@ -31,7 +31,7 @@ namespace cuda::experimental::stf { // This feature requires a CUDA compiler -#ifdef __CUDACC__ +#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) class stream_ctx; template @@ -505,5 +505,5 @@ private: } // namespace reserved -#endif // __CUDACC__ +#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 497dac051eb..0bf646c4b90 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -29,7 +29,7 @@ namespace cuda::experimental::stf { -#ifdef __CUDACC__ +#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) class stream_ctx; class graph_ctx; @@ -468,6 +468,6 @@ private: }; } // end namespace reserved -#endif // __CUDACC__ +#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh index cbe2e779791..99dbf07081a 100644 --- a/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh @@ -419,7 +419,7 @@ private: }; #ifdef UNITTESTED_FILE -# ifdef __CUDACC__ +# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) namespace reserved { template @@ -525,7 +525,7 @@ UNITTEST("thread hierarchy inner sync") cuda_safe_call(cudaDeviceSynchronize()); }; -# endif // __CUDACC__ +# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) #endif // UNITTESTED_FILE } // end namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh index c3c8ac2a82b..029fd45b824 100644 --- a/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh +++ b/cudax/include/cuda/experimental/__stf/stream/stream_ctx.cuh @@ -1125,7 +1125,7 @@ UNITTEST("get logical_data from a task_dep") ctx.finalize(); }; -# ifdef __CUDACC__ +# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) namespace reserved { inline void unit_test_pfor() @@ -1313,8 +1313,8 @@ UNITTEST("basic launch test") }; } // end namespace reserved -# endif // UNITTESTED_FILE +# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) -#endif // __CUDACC__ +#endif // UNITTESTED_FILE } // namespace cuda::experimental::stf diff --git a/cudax/include/cuda/experimental/stf.cuh b/cudax/include/cuda/experimental/stf.cuh index f5ce8f2e4dc..3bfdb5a743a 100644 --- a/cudax/include/cuda/experimental/stf.cuh +++ b/cudax/include/cuda/experimental/stf.cuh @@ -357,18 +357,13 @@ public: return task(default_exec_place(), mv(deps)...); } +#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) /* * parallel_for : apply an operation over a shaped index space */ template auto parallel_for(exec_place e_place, S shape, task_dep... deps) { -#ifndef __CUDACC__ - // We want a test that always fail, but is only triggered when the - // function is instantiated so the test relies on actual templated - // types. - static_assert(::std::is_same_v, "parallel_for is only supported with CUDA compilers."); -#endif EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized."); using result_t = unified_scope, reserved::parallel_for_scope>; @@ -382,12 +377,6 @@ public: template auto parallel_for(partitioner_t p, exec_place e_place, S shape, task_dep... deps) { -#ifndef __CUDACC__ - // We want a test that always fail, but is only triggered when the - // function is instantiated so the test relies on actual templated - // types. - static_assert(::std::is_same_v, "parallel_for is only supported with CUDA compilers."); -#endif EXPECT(payload.index() != ::std::variant_npos, "Context is not initialized."); using result_t = unified_scope, reserved::parallel_for_scope>; @@ -403,6 +392,7 @@ public: { return parallel_for(default_exec_place(), mv(shape), mv(deps)...); } +#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) template auto host_launch(task_dep... deps) @@ -473,17 +463,10 @@ public: payload); } +#if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) template auto launch(thread_hierarchy_spec_t spec, exec_place e_place, task_dep... deps) { -#ifndef __CUDACC__ - // We want a test that always fail, but is only triggered when the - // function is instantiated so the test relies on actual templated - // types. - static_assert(::std::is_same_v, - "launch is only supported with CUDA compilers."); -#endif - using result_t = unified_scope, reserved::launch_scope>; return ::std::visit( @@ -514,6 +497,7 @@ public: { return launch(mv(ths), default_exec_place(), mv(deps)...); } +#endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) auto repeat(size_t count) { @@ -802,7 +786,7 @@ UNITTEST("context with arguments") cuda_safe_call(cudaStreamDestroy(stream)); }; -# ifdef __CUDACC__ +# if defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) namespace reserved { inline void unit_test_context_pfor() @@ -1251,7 +1235,7 @@ UNITTEST("unit_test_partitioner_product") }; } // namespace reserved -# endif // __CUDACC__ +# endif // defined(CUDASTF_CODE_GENERATION) && defined(__CUDACC__) UNITTEST("make_tuple_indexwise") { diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 0b238da03d3..65082348ae8 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -1,23 +1,13 @@ set(stf_test_sources - algorithm/graph_algorithms.cu - algorithm/in_graph_ctx.cu - algorithm/nested.cu - algorithm/algorithm_with_read.cu - allocators/adapter.cu allocators/buddy_allocator.cu - allocators/cap_tmp_buffers.cu - cpp/read_const.cu + cpp/concurrency_test.cu cpp/redundant_data.cu cpp/redundant_data_different_modes.cu - cpp/reuse_computation.cu - cpp/reuse_computation_2.cu cpp/scoped_graph_task.cu cpp/user_streams.cu - cpp/concurrency_test.cu dot/basic.cu - dot/with_events.cu dot/graph_print_to_dot.cu - tools/auto_dump/auto_dump.cu + dot/with_events.cu error_checks/ctx_mismatch.cu error_checks/data_interface_mismatch.cu error_checks/double_finalize.cu @@ -25,107 +15,120 @@ set(stf_test_sources error_checks/misformed_tasks_dbl_end.cu error_checks/misformed_tasks_dbl_start.cu error_checks/non_managed_data.cu - error_checks/slice_check_bounds.cu error_checks/uninitialized_data.cu - error_checks/unsatisfiable_spec.cu error_checks/write_frozen.cu - examples/01-axpy-launch-ranges-cg.cu - examples/01-axpy-places.cu - examples/05-stencil.cu examples/05-stencil-no-copy.cu examples/05-stencil-places.cu + examples/05-stencil.cu examples/05-stencil2d-places.cu - examples/09-nbody.cu + fhe/parse_arctyrex.cu + gnu/include_only.cpp + graph/concurrency_test.cu + graph/graph_ctx_low_level.cu + graph/static_graph_ctx.cu + hashtable/test.cu + interface/data_from_device_async.cu + interface/move_operator.cu + local_stf/legacy_to_stf.cu + places/managed.cu + places/managed_from_user.cu + places/non_current_device.cu + places/place_partition.cu + places/recursion.cu + reclaiming/graph.cu + reclaiming/graph_2.cu + reclaiming/graph_real_oom.cu + reclaiming/stream.cu + reductions/many_inc.cu + reductions/redux_test.cu + reductions/redux_test2.cu + reductions/slice2d_reduction.cu + reductions/slice_custom_op.cu + reductions/successive_reductions.cu + reductions/sum.cu + reductions/sum_array.cu + reductions/sum_multiple_places_no_refvalue.cu + slice/pinning.cu + stencil/stencil-1D.cu + stress/empty_tasks.cu + stress/empty_tasks_alloc.cu + stress/kernel_chain.cu + stress/kernel_chain_fused.cu + stress/many_read.cu + stress/task_bench.cu + threads/axpy-threads-2.cu + # threads/axpy-threads.cu + utility/timing_with_fences.cu +) + +set(stf_test_codegen_sources + algorithm/algorithm_with_read.cu + algorithm/graph_algorithms.cu + algorithm/in_graph_ctx.cu + algorithm/nested.cu + allocators/adapter.cu + allocators/cap_tmp_buffers.cu + cpp/read_const.cu + cpp/reuse_computation.cu + cpp/reuse_computation_2.cu + error_checks/slice_check_bounds.cu + error_checks/unsatisfiable_spec.cu + examples/01-axpy-launch-ranges-cg.cu + examples/01-axpy-places.cu examples/09-nbody-algorithm.cu examples/09-nbody-blocked.cu - fhe/parse_arctyrex.cu + examples/09-nbody.cu freeze/constant_logical_data.cu freeze/freeze.cu freeze/freeze_rw.cu freeze/task_fence.cu - graph/concurrency_test.cu graph/epoch.cu graph/for_each_batched.cu graph/for_each_batched_write.cu graph/freeze_for_graph.cu - graph/graph_ctx_low_level.cu graph/graph_composition.cu + graph/graph_tmp_data.cu graph/many.cu graph/multiple_graph_ctx.cu - graph/static_graph_ctx.cu - graph/graph_tmp_data.cu green_context/axpy_gc.cu green_context/cuda_graph.cu green_context/gc_grid.cu - gnu/include_only.cpp - hash/logical_data.cu hash/ctx_hash.cu + hash/logical_data.cu hashtable/fusion.cu hashtable/fusion_reduction.cu - hashtable/test.cu hashtable/parallel_for.cu hashtable/parallel_for_shape.cu - interface/mix_stream_and_graph.cu - interface/mix_stream_and_graph_2.cu interface/data_from_device.cu interface/data_from_device_2.cu interface/data_from_device_wb.cu - interface/data_from_device_async.cu interface/graph_use_device_data.cu - interface/stream_add_callback.cu - interface/move_operator.cu + interface/mix_stream_and_graph.cu + interface/mix_stream_and_graph_2.cu interface/scal.cu interface/scalar_div.cu + interface/stream_add_callback.cu local_stf/interop_cuda.cu - local_stf/legacy_to_stf.cu loop_dispatch/dispatch_on_streams.cu - loop_dispatch/nested_loop_dispatch.cu loop_dispatch/loop_dispatch.cu + loop_dispatch/nested_loop_dispatch.cu + parallel_for/fdtd.cu + parallel_for/parallel_for_all_devs.cu + parallel_for/parallel_for_box.cu parallel_for/parallel_for_repeat.cu - parallel_for/test_parallel_for.cu parallel_for/test2_parallel_for_context.cu - parallel_for/parallel_for_box.cu - parallel_for/parallel_for_all_devs.cu - parallel_for/fdtd.cu + parallel_for/test_parallel_for.cu parallel_for/tiled_loops.cu places/cuda_stream_place.cu - places/managed.cu places/managed_from_shape.cu - places/managed_from_user.cu - places/non_current_device.cu - places/place_partition.cu - places/recursion.cu - reclaiming/graph.cu - reclaiming/graph_2.cu - reclaiming/graph_real_oom.cu - reclaiming/stream.cu - reductions/many_inc.cu - reductions/redux_test.cu - reductions/redux_test2.cu - reductions/slice2d_reduction.cu - reductions/slice_custom_op.cu - reductions/successive_reductions.cu reductions/successive_reductions_pfor.cu - reductions/sum.cu - reductions/sum_array.cu reductions/sum_multiple_places.cu - reductions/sum_multiple_places_no_refvalue.cu reductions/write_back_after_redux.cu - slice/pinning.cu - stencil/stencil-1D.cu - stress/empty_tasks.cu - stress/empty_tasks_alloc.cu - stress/kernel_chain.cu - stress/kernel_chain_fused.cu stress/launch_overhead.cu stress/launch_vs_parallelfor.cu - stress/many_read.cu stress/parallel_for_overhead.cu - stress/task_bench.cu - # threads/axpy-threads.cu - threads/axpy-threads-2.cu # threads/axpy-threads-pfor.cu # Currently has a difficult-to-reproduce concurrency problem - utility/timing_with_fences.cu + tools/auto_dump/auto_dump.cu ) # Examples using CUBLAS, CUSOLVER... @@ -266,6 +269,12 @@ foreach(cn_target IN LISTS cudax_TARGETS) cudax_add_stf_test(test_target "${source}" ${cn_target}) endforeach() + if (cudax_ENABLE_CUDASTF_CODE_GENERATION) + foreach(source IN LISTS stf_test_codegen_sources) + cudax_add_stf_test(test_target "${source}" ${cn_target}) + endforeach() + endif() + # Tests with mathlib deps: if (cudax_ENABLE_CUDASTF_MATHLIBS) foreach(source IN LISTS stf_test_mathlib_sources) diff --git a/cudax/test/stf/static_error_checks/CMakeLists.txt b/cudax/test/stf/static_error_checks/CMakeLists.txt index 576cb992b68..8e055cb0430 100644 --- a/cudax/test/stf/static_error_checks/CMakeLists.txt +++ b/cudax/test/stf/static_error_checks/CMakeLists.txt @@ -1,6 +1,4 @@ set(static_assert_tests - gnu_launch.cpp - gnu_parallel_for.cpp static.cu task_prototype.cu stream_task_prototype.cu diff --git a/cudax/test/stf/static_error_checks/gnu_launch.cpp b/cudax/test/stf/static_error_checks/gnu_launch.cpp deleted file mode 100644 index 8c39ad6d4d9..00000000000 --- a/cudax/test/stf/static_error_checks/gnu_launch.cpp +++ /dev/null @@ -1,27 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDASTF in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include - -using namespace cuda::experimental::stf; - -int main() -{ - context ctx; - - auto A = ctx.logical_data(size_t(128)); - - // We cannot do a launch on a device lambda without a CUDA compiler - // as we normally expect an extended lambda (which is also missing here) - ctx.launch(par(128), A.write())->*[](auto th, auto A) { A(th.rank(i) = 0; - }; - - ctx.finalize(); -} diff --git a/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp b/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp deleted file mode 100644 index fcc485d01de..00000000000 --- a/cudax/test/stf/static_error_checks/gnu_parallel_for.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of CUDASTF in CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#include - -using namespace cuda::experimental::stf; - -int main() -{ - context ctx; - - auto A = ctx.logical_data(size_t(1)); - - // We cannot do a parallel for on a device lambda without a CUDA compiler - // as we normally expect an extended lambda (which is also missing here) - ctx.parallel_for(A.shape(), A.write())->*[](size_t i, auto A) { - A(i) = 0; - }; - - ctx.finalize(); -}