Skip to content

Commit

Permalink
Fixes #473, regards #304: Host-function launching-related changes:
Browse files Browse the repository at this point in the history
* No longer allocating heap memory on enqueue and releasing it during launch - only passing pointers the user has provided. Part of the motivation for this is enabling stream capture and re-execution of the launch.
* Separated a method for enqueuing no-argument callables and enqueuing functions which take a single (pointer) argument.
* Enqueued callables no longer receive a stream (as CUDA has moved away from this convention, and we can't make it happen without the heap allocation scheme we had before).
* `#ifdef`'ed out parts of `launch_config_builder.hpp` which require CUDA 10.0 to run (essentially obtaining minimum dimensions for maximum occupancy).
* Dropped some redundant comments in `stream.hpp` about the choice of API functions
  • Loading branch information
eyalroz authored and Eyal Rozenberg committed Mar 9, 2023
1 parent 30fa9af commit 86f6d45
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 117 deletions.
7 changes: 2 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,11 +117,8 @@ The [Milestones](https://github.com/eyalroz/cuda-api-wrappers/milestones) indica

We've all dreamed of being able to type in:

my_stream.enqueue.callback(
[&foo](cuda::stream_t stream, cuda::status_t status) {
::std::cout << "Hello " << foo << " world!\n";
}
);
auto callback = [&foo] { ::std::cout << "Hello " << foo << " world!\n"; };
my_stream.enqueue.host_invokable(callback);

... and have that just work, right? Well, now it does!

Expand Down
19 changes: 9 additions & 10 deletions examples/by_runtime_api_module/event_management.cu
Original file line number Diff line number Diff line change
Expand Up @@ -106,17 +106,16 @@ int main(int argc, char **argv)

stream.enqueue.kernel_launch(print_message<N,1>, { 1, 1 }, message<N>("I am launched before the first event"));
stream.enqueue.event(event_1);
stream.enqueue.host_function_call(
[&event_1, &event_2](const cuda::stream_t&) {
report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
}
);
auto first_callback = [&] {
report_occurrence("In first callback (enqueued after first event but before first kernel)", event_1, event_2);
};
stream.enqueue.host_invokable(first_callback);
stream.enqueue.kernel_launch(increment, launch_config, buffer.get(), buffer_size);
stream.enqueue.host_function_call(
[&event_1, &event_2](const cuda::stream_t& ) {
report_occurrence("In second callback (enqueued after the first kernel but before the second event)", event_1, event_2);
}
);
auto second_callback = [&] {
report_occurrence("In second callback (enqueued after the first kernel but before the second event)",
event_1, event_2);
};
stream.enqueue.host_invokable(second_callback);
stream.enqueue.event(event_2);
stream.enqueue.kernel_launch(print_message<N,3>, { 1, 1 }, message<N>("I am launched after the second event"));
stream.enqueue.event(event_3);
Expand Down
6 changes: 3 additions & 3 deletions examples/by_runtime_api_module/execution_control.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ int main(int argc, char **argv)
launch_config_4 = launch_config_2;
launch_config_4 = ::std::move(launch_config_3);
[[maybe_unused]] cuda::launch_configuration_t launch_config_5{::std::move(launch_config_2)};
// In case the `[[maybe_unused]]` attribute is ignored, let's try to trick the compiler
// into thinking we're actually using launch_config_4.
launch_config_4.dimensions == launch_config.dimensions;
// In case the `[[maybe_unused]]` attribute is ignored, let's try to trick the compiler
// into thinking we're actually using launch_config_4.
launch_config_4.dimensions == launch_config.dimensions;
}

cuda::launch(kernel_function, launch_config, bar);
Expand Down
11 changes: 5 additions & 6 deletions examples/by_runtime_api_module/stream_management.cu
Original file line number Diff line number Diff line change
Expand Up @@ -166,12 +166,11 @@ int main(int argc, char **argv)
auto event_1 = cuda::event::create(device, cuda::event::sync_by_blocking);
stream_1.enqueue.kernel_launch(print_message<N,3>, single_thread_config, message<N>("I'm on stream 1"));
stream_1.enqueue.memset(buffer.get(), 'b', buffer_size);
stream_1.enqueue.host_function_call(
[&buffer](cuda::stream_t) {
::std::cout << "Callback from stream 1!... \n";
print_first_char(buffer.get());
}
);
auto callback = [&]() {
::std::cout << "Callback from stream 1!... \n";
print_first_char(buffer.get());
};
stream_1.enqueue.host_invokable(callback);
auto threads_per_block = cuda::kernel::get(device, increment).get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
auto num_blocks = div_rounding_up(buffer_size, threads_per_block);
auto launch_config = cuda::make_launch_config(num_blocks, threads_per_block);
Expand Down
10 changes: 4 additions & 6 deletions examples/other/io_compute_overlap_with_streams.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,12 +135,10 @@ int main(int, char **)
buffer_set.device_result.get(),
num_elements);
stream.enqueue.copy(buffer_set.host_result.get(), buffer_set.device_result.get(), buffer_size);
stream.enqueue.host_function_call(
[=](cuda::stream_t) {
::std::cout
<< "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << ::std::endl;
}
);
auto callback = [=] {
::std::cout << "Stream " << k+1 << " of " << num_kernels << " has concluded all work. " << ::std::endl;
};
stream.enqueue.host_invokable(callback);
}
::std::this_thread::sleep_for(::std::chrono::microseconds(50000));
for(auto& stream : streams) { stream.synchronize(); }
Expand Down
10 changes: 10 additions & 0 deletions src/cuda/api/launch_config_builder.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ class launch_config_builder_t {
{
grid::composite_dimensions_t result;
if (saturate_with_active_blocks_) {
#if CUDA_VERSION >= 10000
if (use_min_params_for_max_occupancy_) {
throw ::std::logic_error(
"Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
"the grid with fixed-size cubs.");
}
#endif
if (not (kernel_)) {
throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
}
Expand All @@ -94,6 +96,7 @@ class launch_config_builder_t {
result.grid = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
return result;
}
#if CUDA_VERSION >= 10000
if (use_min_params_for_max_occupancy_) {
if (not (kernel_)) {
throw ::std::logic_error("A kernel must be set to determine the minimum grid parameter sfor m");
Expand All @@ -108,6 +111,7 @@ class launch_config_builder_t {
result.grid = composite_dims.grid;
return result;
}
#endif
if (dimensions_.block and dimensions_.overall) {
result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
result.block = dimensions_.block.value();
Expand Down Expand Up @@ -175,7 +179,9 @@ class launch_config_builder_t {
const kernel_t* kernel_ { nullptr };
optional<device::id_t> device_;
bool saturate_with_active_blocks_ { false };
#if CUDA_VERSION >= 10000
bool use_min_params_for_max_occupancy_ { false };
#endif

static cuda::device_t device(optional<device::id_t> maybe_id)
{
Expand Down Expand Up @@ -528,7 +534,9 @@ class launch_config_builder_t {
}
dimensions_.grid = nullopt;
dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
use_min_params_for_max_occupancy_ = false;
#endif
saturate_with_active_blocks_ = true;
return *this;
}
Expand All @@ -541,7 +549,9 @@ class launch_config_builder_t {
dimensions_.block = nullopt;
dimensions_.grid = nullopt;
dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
use_min_params_for_max_occupancy_ = true;
#endif
saturate_with_active_blocks_ = false;
return *this;
}
Expand Down
4 changes: 2 additions & 2 deletions src/cuda/api/multi_wrapper_impls/pointer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,8 @@ inline context_t context_of(const void* ptr)
void* value_ptrs[] = {&device_id, &context_handle};
pointer::detail_::get_attributes(2, attributes, value_ptrs, ptr);
#else
auto context_handle = pointer::detail_::context_handle_of(ptr_);
auto device_id = context::detail_::get_device_id(ptr_);
auto context_handle = pointer::detail_::context_handle_of(ptr);
auto device_id = context::detail_::get_device_id(context_handle);
#endif
return context::wrap(device_id, context_handle);
}
Expand Down
147 changes: 64 additions & 83 deletions src/cuda/api/stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,6 @@ inline handle_t create_raw_in_current_context(
CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
handle_t new_stream_handle;
auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
// We could instead have used an equivalent Driver API call:
// cuStreamCreateWithPriority(cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
throw_if_error_lazy(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
return new_stream_handle;
}
Expand Down Expand Up @@ -150,6 +148,9 @@ inline void record_event_in_current_context(
stream::handle_t stream_handle,
event::handle_t event_handle);

template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void * argument);

} // namespace detail_

/**
Expand Down Expand Up @@ -304,59 +305,21 @@ class stream_t {

protected: // static methods

/**
* A function used internally by this class as the host function to call directly; see
* @ref enqueue_t::host_function_call - but only with CUDA version 10.0 and later.
*
* @param stream_handle the ID of the stream for which a host function call was triggered - this
* will be passed by the CUDA runtime
* @param stream_wrapper_members_and_callable a tuple, containing the information necessary to
* recreate the wrapper with which the callback is associated, without any additional CUDA API calls -
* plus the callable which was passed to @ref enqueue_t::host_function_call, and which the programmer
* actually wants to be called.
*
* @note instances of this template are of type {@ref callback_t}.
*/
template <typename Callable>
static void CUDA_CB stream_launched_host_function_adapter(void * stream_wrapper_members_and_callable)
{
using tuple_type = ::std::tuple<device::id_t, context::handle_t , stream::handle_t, Callable>;
auto* tuple_ptr = reinterpret_cast<tuple_type *>(stream_wrapper_members_and_callable);
auto unique_ptr_to_tuple = ::std::unique_ptr<tuple_type>{tuple_ptr}; // Ensures deletion when we leave this function.
auto device_id = ::std::get<0>(*unique_ptr_to_tuple.get());
auto context_handle = ::std::get<1>(*unique_ptr_to_tuple.get());
auto stream_handle = ::std::get<2>(*unique_ptr_to_tuple.get());
const auto& callable = ::std::get<3>(*unique_ptr_to_tuple.get());
callable( stream_t{device_id, context_handle, stream_handle, do_not_take_ownership} );
}

/**
* @brief A function to @ref `host_function_launch_adapter`, for use with the old-style CUDA Runtime API call,
* which passes more arguments to the callable - and calls the host function even on device failures.
* which passes more arguments to the invokable - and calls the host function even on device failures.
*
* @param stream_handle the ID of the stream for which a host function call was triggered - this
* will be passed by the CUDA runtime
* @note status indicates the status the CUDA status when the host function call is triggered; anything
* other than @ref `cuda::status::success` means there's been a device error previously - but
* in that case, we won't invoke the callable, as such execution is deprecated; see:
* in that case, we won't invoke the invokable, as such execution is deprecated; see:
* https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM
* @param device_id_and_callable a pair-value, containing the ID of the device to which the stream launching
* the host function call is associated, as well as the callable callback which was passed to
* @param device_id_and_invokable a pair-value, containing the ID of the device to which the stream launching
* the host function call is associated, as well as the invokable callback which was passed to
* @ref enqueue_t::host_function_call, and which the programmer actually wants to be called.
*/
template <typename Callable>
static void callback_launch_adapter(
stream::handle_t,
status_t status,
void * stream_wrapper_members_and_callable)
{
if (status != cuda::status::success) {
using tuple_type = ::std::tuple<device::id_t, context::handle_t , stream::handle_t, Callable>;
delete reinterpret_cast<tuple_type*>(stream_wrapper_members_and_callable);
return;
}
stream_launched_host_function_adapter<Callable>(stream_wrapper_members_and_callable);
}


public: // mutators

Expand Down Expand Up @@ -518,53 +481,37 @@ class stream_t {
bool records_timing = event::do_record_timings,
bool interprocess = event::not_interprocess) const;

# if CUDA_VERSION >= 10000
/**
* Execute the specified function on the calling host thread once all
* Execute the specified function on the calling host thread, after all
* hereto-scheduled work on this stream has been completed.
*
* @param callable_ a function to execute on the host. It must be callable
* with two parameters: `cuda::stream::handle_t stream_handle, cuda::event::handle_t event_handle`
* @param invokable_ an object to call. It must be callable/invokable with
* no arguments
*/
template <typename Callable>
void host_function_call(Callable callable_) const
template <typename Argument>
void host_function_call(void (*function)(Argument*), Argument* argument) const
{
context::current::detail_::scoped_override_t set_context_for_this_scope(associated_stream.context_handle_);

// Since callable_ will be going out of scope after the enqueueing,
// and we don't know anything about the scope of the original argument with
// which we were called, we must make a copy of `callable_` on the heap
// and pass that as the user-defined data. We also add information about
// the enqueueing stream.
auto raw_callable_extra_argument = new
::std::tuple<device::id_t, context::handle_t, stream::handle_t, Callable>(
associated_stream.device_id_,
associated_stream.context_handle_,
associated_stream.handle(),
Callable(::std::move(callable_))
);

// While we always register the same static function, `callback_adapter` as the
// callback - what it will actually _do_ is invoke the callback we were passed.

#if CUDA_VERSION >= 10000
auto status = cuLaunchHostFunc(
associated_stream.handle_, &stream_launched_host_function_adapter<Callable>, raw_callable_extra_argument);
// Could have used the equivalent Driver API call: cuLaunchHostFunc()
#else
// The nVIDIA runtime API (at least up to v10.2) requires passing 0 as the flags
// variable, see:
// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
static constexpr const unsigned fixed_flags { 0u };
auto status = cuStreamAddCallback(
associated_stream.handle_, &callback_launch_adapter<Callable>, raw_callable_extra_argument, fixed_flags);
// Could have used the equivalent Driver API call: cuAddStreamCallback()
// I hope you like function declaration punning :-)
stream::detail_::enqueue_function_call(
associated_stream, reinterpret_cast<stream::detail_::callback_t>(function), argument);
}
#endif

throw_if_error_lazy(status, "Failed scheduling a callback to be launched on "
+ stream::detail_::identify(associated_stream.handle_,
associated_stream.context_handle_, associated_stream.device_id_));
private:
// Type-erased trampoline: the CUDA driver calls this with the user-supplied
// pointer; we cast it back to the concrete Invokable type and invoke it with
// no arguments.
template <typename Invokable>
static void CUDA_CB stream_launched_invoker(void* type_erased_invokable) {
auto invokable = reinterpret_cast<Invokable*>(type_erased_invokable);
(*invokable)();
}

public:
/**
 * Schedule a no-argument invokable object to be called on the host, after
 * all hereto-scheduled work on this stream has been completed.
 *
 * @param invokable a no-argument callable object; taken by reference and
 *     passed to the driver by address, so it must remain alive until it has
 *     actually been invoked — the caller is responsible for its lifetime.
 *
 * @note NOTE(review): `stream_launched_invoker<Invokable>` is cast to
 *     `stream::detail_::callback_t` and will be invoked through that type;
 *     for pre-10.0 CUDA, `callback_t` is `CUstreamCallback`, whose signature
 *     differs from the invoker's — confirm this punning is intended on all
 *     supported CUDA versions.
 */
template <typename Invokable>
void host_invokable(Invokable& invokable) const
{
auto type_erased_invoker = reinterpret_cast<stream::detail_::callback_t>(stream_launched_invoker<Invokable>);
stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
}

#if CUDA_VERSION >= 11020
/**
Expand Down Expand Up @@ -999,6 +946,40 @@ inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr addres
return cuStreamWriteValue64(stream_handle, address, value, flags);
}

/**
 * Enqueue a host function call on a stream, to be executed once all
 * previously-scheduled work on that stream has concluded.
 *
 * @param stream the stream on which to enqueue the host function call; its
 *     context is made current for the duration of the enqueue API call
 * @param function the (type-erased) host function pointer to enqueue; it must
 *     be compatible with @ref callback_t — `CUhostFn` for CUDA 10.0 and later,
 *     `CUstreamCallback` for earlier versions
 * @param argument the user-provided pointer passed on to @p function when it
 *     is eventually invoked; no copy is made, so the pointee must remain
 *     alive until then
 */
template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void* argument)
{
context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());

// No adapter and no heap allocation here: the user's function pointer and
// argument are registered with the driver as-is.

#if CUDA_VERSION >= 10000
auto status = cuLaunchHostFunc(stream.handle(), function, argument);
#else
// Older CUDA versions only offer the callback-style API; it requires passing
// 0 as the flags variable, see:
// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
static constexpr const unsigned fixed_flags { 0u };
auto status = cuStreamAddCallback(stream.handle(), function, argument, fixed_flags);
#endif
throw_if_error_lazy(status, "Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
}

} // namespace detail_

/**
Expand Down
13 changes: 11 additions & 2 deletions src/cuda/api/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ enum : priority_t {
default_priority = 0
};

namespace detail_ {

// The raw function-pointer type which can be registered for host-side
// execution on a stream: CUDA 10.0 introduced cuLaunchHostFunc, taking a
// CUhostFn (a single void* argument); earlier versions only offer
// cuStreamAddCallback, whose CUstreamCallback also receives the stream
// handle and a status.
#if CUDA_VERSION >= 10000
using callback_t = CUhostFn;
#else
using callback_t = CUstreamCallback;
#endif

} // namespace detail_


} // namespace stream

namespace grid {
Expand Down Expand Up @@ -913,8 +924,6 @@ using handle_t = CUfunction;

} // namespace kernel

using callback_t = CUhostFn;

// The C++ standard library doesn't offer ::std::dynarray (although it almost did),
// and we won't introduce our own here. So...
template <typename T>
Expand Down

0 comments on commit 86f6d45

Please sign in to comment.