From ed7775371a039971fe8a49d487c02622f80f459c Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Thu, 25 Jan 2024 22:07:59 +0200 Subject: [PATCH] Fixes #565: Completed the implementation of a library kernel class - completing the support for libraries overall CAVEAT: No example code to actually load and launch any cuLibraries --- src/cuda/api.hpp | 1 + src/cuda/api/kernel_launch.hpp | 38 +++- src/cuda/api/kernels/in_library.hpp | 206 ++++++++++++++++++ src/cuda/api/library.hpp | 52 ++--- .../api/multi_wrapper_impls/kernel_launch.hpp | 39 +++- .../multi_wrapper_impls/library_kernel.hpp | 50 +++++ 6 files changed, 354 insertions(+), 32 deletions(-) create mode 100644 src/cuda/api/kernels/in_library.hpp create mode 100644 src/cuda/api/multi_wrapper_impls/library_kernel.hpp diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index a6143017..1f4a4657 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -42,6 +42,7 @@ #include "api/module.hpp" #if CUDA_VERSION >= 12000 #include "api/library.hpp" +#include "api/kernels/in_library.hpp" #endif #include "api/link.hpp" diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index a57092d9..eb7bdfb4 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -43,6 +43,10 @@ #include "launch_configuration.hpp" #include "kernel.hpp" #include "kernels/apriori_compiled.hpp" +#if CUDA_VERSION >= 12000 +#include "kernels/in_library.hpp" +#endif + #if CUDA_VERSION >= 9000 // The following is necessary for cudaLaunchCooperativeKernel @@ -124,6 +128,7 @@ struct enqueue_launch_helper { template void enqueue_launch( + ::std::integral_constant, ::std::integral_constant, Kernel&& kernel_function, const stream_t& stream, @@ -132,6 +137,16 @@ void enqueue_launch( template void enqueue_launch( + ::std::integral_constant, + ::std::integral_constant, + Kernel&& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... 
parameters); + +template +void enqueue_launch( + ::std::integral_constant, ::std::integral_constant, Kernel&& kernel, const stream_t& stream, @@ -308,12 +323,18 @@ void enqueue_launch( static_assert( detail_::all_true<::std::is_trivially_copy_constructible>::value...>::value, "All kernel parameter types must be of a trivially copy-constructible (decayed) type." ); - static constexpr const bool wrapped_kernel = ::std::is_base_of::type>::value; + static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of::type>::value; +#if CUDA_VERSION >= 12000 + static constexpr const bool library_kernel = cuda::detail_::is_library_kernel::value; +#else + static constexpr const bool library_kernel = false; +#endif // CUDA_VERSION >= 12000 // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to // use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch // function for each of them will basically be just a one-liner :-( detail_::enqueue_launch( - ::std::integral_constant{}, + ::std::integral_constant{}, + ::std::integral_constant{}, ::std::forward(kernel), stream, launch_configuration, ::std::forward(parameters)...); } @@ -338,12 +359,15 @@ void launch( * Type of the container for the marshalled arguments; typically, this * would be `span` - but it can be an `::std::vector`, or * have non-const `void*` elements etc. + * @param kernel + * A wrapped GPU kernel * @param stream * Proxy for the stream on which to enqueue the kernel launch; may be the * default stream of a context. 
* @param marshalled_arguments * A container of `void` or `const void` pointers to the argument values */ +///@{ template void launch_type_erased( const kernel_t& kernel, @@ -351,6 +375,16 @@ void launch_type_erased( launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments); +#if CUDA_VERSION >= 12000 +template +void launch_type_erased( + const library::kernel_t& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + SpanOfConstVoidPtrLike marshalled_arguments); +///@} +#endif // CUDA_VERSION >= 12000 + } // namespace cuda #endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ diff --git a/src/cuda/api/kernels/in_library.hpp b/src/cuda/api/kernels/in_library.hpp new file mode 100644 index 00000000..9e7225a6 --- /dev/null +++ b/src/cuda/api/kernels/in_library.hpp @@ -0,0 +1,206 @@ +/** + * @file + * + * @brief A @ref A wrapper class for compiled kernels in a loaded library, + * which are unassociated with a device and a context. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ +#define CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ + +#include "../library.hpp" + +namespace cuda { + +///@cond +class kernel_t; +///@nocond + +namespace detail_ { + +template +struct is_library_kernel : ::std::is_same::type, library::kernel_t> { }; + +} // namespace detail_ + +// TODO: Avoid the copy? +kernel_t contextualize(const library::kernel_t& kernel, const context_t& context); + +namespace library { + +///@cond +class kernel_t; +///@nocond + +namespace kernel { + +using handle_t = CUkernel; +using cuda::kernel::attribute_t; +using cuda::kernel::attribute_value_t; +// using cuda::kernel::apriori_compiled::attributes_t; + +namespace detail_ { + +// Note: library kernels never hold a PC refcount unit, nor do they own anything; +// only the library wrapper owns (and it's not associated with the kernel). 
+kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle);
+
+inline ::std::string identify(kernel::handle_t handle)
+{
+	return "library kernel at " + cuda::detail_::ptr_as_hex(handle);
+}
+
+inline ::std::string identify(library::handle_t library_handle, kernel::handle_t handle)
+{
+	return identify(handle) + " within " + library::detail_::identify(library_handle);
+}
+
+::std::string identify(const kernel_t &kernel);
+
+inline ::std::pair<cuda::kernel::handle_t, status_t> contextualize_in_current_context(
+	const kernel::handle_t& library_kernel_handle)
+{
+	cuda::kernel::handle_t contextualized_kernel_handle;
+	auto status = cuKernelGetFunction(&contextualized_kernel_handle, library_kernel_handle);
+	return {contextualized_kernel_handle, status};
+}
+
+inline cuda::kernel::handle_t contextualize(
+	const handle_t& kernel_handle,
+	const context::handle_t context_handle)
+{
+	CAW_SET_SCOPE_CONTEXT(context_handle);
+	auto handle_and_status = contextualize_in_current_context(kernel_handle);
+	throw_if_error_lazy(handle_and_status.second, "Failed placing " + identify(kernel_handle) + " in "
+		+ context::detail_::identify(context_handle));
+	return handle_and_status.first;
+}
+
+inline attribute_value_t get_attribute(
+	handle_t library_kernel_handle,
+	device::id_t device_id,
+	kernel::attribute_t attribute)
+{
+	attribute_value_t value;
+	auto status = cuKernelGetAttribute(&value, attribute, library_kernel_handle, device_id);
+	throw_if_error_lazy(status, ::std::string("Failed getting attribute ")
+		+ cuda::kernel::detail_::attribute_name(attribute) + " for " + identify(library_kernel_handle)
+		+ " on " + device::detail_::identify(device_id));
+	return value; // Fix: return statement was missing - control fell off the end of a non-void function (UB)
+}
+
+// Fix: was declared as returning attribute_value_t, but returned nothing; a setter
+// has no value to report, and the public set_attribute wrapper returns void
+inline void set_attribute(
+	kernel::handle_t library_kernel_handle,
+	device::id_t device_id,
+	kernel::attribute_t attribute,
+	attribute_value_t value)
+{
+	auto status = cuKernelSetAttribute(attribute, value, library_kernel_handle, device_id);
+	throw_if_error_lazy(status, ::std::string("Failed setting attribute ") +
cuda::kernel::detail_::attribute_name(attribute) + " value to " + ::std::to_string(value) +
+		" for " + identify(library_kernel_handle) + " on " + device::detail_::identify(device_id));
+}
+
+} // namespace detail_
+
+attribute_value_t get_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device);
+
+inline void set_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device,
+	attribute_value_t value);
+
+} // namespace kernel
+
+/**
+ * @brief A proxy class for a kernel within a loaded library - not (yet)
+ * associated with any device or context, unlike a @ref `cuda::kernel_t`.
+ */
+class kernel_t {
+public: // getters
+	kernel::handle_t handle() const noexcept { return handle_; }
+	library::handle_t library_handle() const noexcept { return library_handle_; }
+	library_t library() const noexcept { return library::detail_::wrap(library_handle_); }
+
+public: // type_conversions
+
+public: // non-mutators
+
+#if CUDA_VERSION >= 12300
+	/**
+	 * Return the kernel function name as registered within its library
+	 *
+	 * @note This may return a mangled name if the kernel function was not declared as having C linkage.
+ */ + const char* name() const + { + if (name_ != nullptr) { return name_; } + const char* result; + auto status = cuKernelGetName(&result, handle_); + throw_if_error_lazy(status, "Retrieving the name of " + kernel::detail_::identify(*this)); + name_ = result; + return name_; + } +#endif + cuda::kernel_t contextualize(const context_t& context) const; + +protected: // ctors & dtor + kernel_t(library::handle_t library_handle, kernel::handle_t handle) + : + library_handle_(library_handle), handle_(handle) {} + +public: // ctors & dtor + kernel_t(const kernel_t &) = default; + kernel_t(kernel_t&& other) = default; + +public: // friends + friend kernel_t kernel::detail_::wrap(library::handle_t, kernel::handle_t); + +protected: // data members + library::handle_t library_handle_; + kernel::handle_t handle_; + mutable const char* name_ { nullptr }; // The name is cached after having been retrieved for the first time +}; // kernel_t + +namespace kernel { +namespace detail_ { + +inline kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle) +{ + return {library_handle, handle}; +} + +inline ::std::string identify(const kernel_t& library_kernel) +{ + return identify(library_kernel.library_handle(), library_kernel.handle()); +} + +} // namespace detail_ + +inline kernel_t get(const library_t& library, const char* name) +{ + auto kernel_handle = library::detail_::get_kernel(library.handle(), name); + return kernel::detail_::wrap(library.handle(), kernel_handle); +} + +} // namespace kernel + +} // namespace library + +inline library::kernel_t library_t::get_kernel(const char* name) const +{ + return library::kernel::get(*this, name); +} + +inline library::kernel_t library_t::get_kernel(const ::std::string& name) const +{ + return get_kernel(name.c_str()); +} + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ + diff --git a/src/cuda/api/library.hpp b/src/cuda/api/library.hpp index b8cc4eac..898666f1 100644 --- 
a/src/cuda/api/library.hpp +++ b/src/cuda/api/library.hpp @@ -11,6 +11,7 @@ #if CUDA_VERSION >= 12000 #include "module.hpp" +#include "error.hpp" #if __cplusplus >= 201703L #include @@ -19,7 +20,6 @@ namespace cuda { ///@cond -class device_t; class context_t; class module_t; class library_t; @@ -74,10 +74,25 @@ library_t create( bool code_is_preserved); ///@} + +namespace detail_ { + +inline kernel::handle_t get_kernel(handle_t library_handle, const char* name) +{ + library::kernel::handle_t kernel_handle; + auto status = cuLibraryGetKernel(&kernel_handle, library_handle, name); + throw_if_error_lazy(status, ::std::string{"Failed obtaining kernel "} + name + + "' from " + library::detail_::identify(library_handle)); + return kernel_handle; +} + +} // namespace detail_ + +inline kernel_t get_kernel(const library_t& library, const char* name); + } // namespace library memory::region_t get_global(const context_t& context, const library_t& library, const char* name); -kernel_t get_kernel(const context_t& context, const library_t& library, const char* name); memory::region_t get_managed_region(const library_t& library, const char* name); namespace module { @@ -110,15 +125,8 @@ class library_t { * @return An enqueable kernel proxy object for the requested kernel, * in the current context. */ - cuda::kernel_t get_kernel(const char* name) const - { - return cuda::get_kernel(context::current::get(), *this, name); - } - - cuda::kernel_t get_kernel(const ::std::string& name) const - { - return get_kernel(name.c_str()); - } + library::kernel_t get_kernel(const char* name) const; + library::kernel_t get_kernel(const ::std::string& name) const; memory::region_t get_global(const char* name) const { @@ -196,22 +204,10 @@ inline memory::region_t get_global(const context_t& context, const library_t& li // Note: Nothing is holding a PC refcount unit here! 
} -// Implement other get's +// More library item getters +namespace library { -inline kernel_t get_kernel(const context_t& context, const library_t& library, const char* name) -{ - CAW_SET_SCOPE_CONTEXT(context.handle()); - library::kernel::handle_t new_handle; - auto status = cuLibraryGetKernel(&new_handle, library.handle(), name); - throw_if_error_lazy(status, ::std::string("Failed obtaining kernel '") + name - + "' from " + library::detail_::identify(library)); - kernel::handle_t new_proper_kernel_handle; - status = cuKernelGetFunction(&new_proper_kernel_handle, new_handle); - throw_if_error_lazy(status, ::std::string("Failed obtaining a context-associated kernel ") - + "from kernel '" + name + "' in " + library::detail_::identify(library)); - return kernel::wrap(context.device_id(), context.handle(), - new_proper_kernel_handle, do_hold_primary_context_refcount_unit); -} +} // namespace library inline memory::region_t get_managed_region(const library_t& library, const char* name) { @@ -225,6 +221,9 @@ inline memory::region_t get_managed_region(const library_t& library, const char* namespace module { +/** + * Create an in-context module from the compiled code within a loaded library + */ inline module_t create(const context_t& context, const library_t& library) { CAW_SET_SCOPE_CONTEXT(context.handle()); @@ -241,6 +240,7 @@ inline module_t create(const context_t& context, const library_t& library) } // namespace module +// I really have no idea what this does! 
inline void* get_unified_function(const context_t& context, const library_t& library, const char* symbol) { CAW_SET_SCOPE_CONTEXT(context.handle()); diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp index e546430a..24278859 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp @@ -207,7 +207,8 @@ struct enqueue_launch_helper { template void enqueue_launch( - ::std::integral_constant, // Got a raw kernel function + ::std::integral_constant, // Not a wrapped contextual kernel, + ::std::integral_constant, // and not a library kernel, so it must be a raw kernel function RawKernelFunction&& kernel_function, const stream_t& stream, launch_configuration_t launch_configuration, @@ -228,7 +229,8 @@ void enqueue_launch( template void enqueue_launch( -::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + ::std::integral_constant, // Not a library kernel Kernel&& kernel, const stream_t& stream, launch_configuration_t launch_configuration, @@ -239,6 +241,23 @@ ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)cla ::std::forward(parameters)...); } +#if CUDA_VERSION >= 12000 +template +void enqueue_launch( + ::std::integral_constant, // Not a wrapped contextual kernel, + ::std::integral_constant, // but a library kernel + Kernel&& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... 
parameters) +{ + kernel_t contextualized = cuda::contextualize(kernel, stream.context()); + enqueue_launch_helper {}( + contextualized, stream, launch_configuration, + ::std::forward(parameters)...); +} +#endif // CUDA_VERSION >= 12000 + } // namespace detail_ template @@ -253,8 +272,7 @@ inline void launch( // Note: If Kernel is a kernel_t, and its associated device is different // than the current device, the next call will fail: - enqueue_launch(kernel, stream, launch_configuration, - ::std::forward(parameters)...); + enqueue_launch(kernel, stream, launch_configuration, ::std::forward(parameters)...); } template @@ -283,6 +301,19 @@ inline void launch_type_erased( static_cast(marshalled_arguments.data())); } +#if CUDA_VERSION >= 12000 +template +void launch_type_erased( + const library::kernel_t& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + SpanOfConstVoidPtrLike marshalled_arguments) +{ + auto contextualized = contextualize(kernel, stream.context()); + launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments); +} +#endif // CUDA_VERSION >= 12000 + #if ! CAN_GET_APRIORI_KERNEL_HANDLE #if defined(__CUDACC__) diff --git a/src/cuda/api/multi_wrapper_impls/library_kernel.hpp b/src/cuda/api/multi_wrapper_impls/library_kernel.hpp new file mode 100644 index 00000000..4b4d4374 --- /dev/null +++ b/src/cuda/api/multi_wrapper_impls/library_kernel.hpp @@ -0,0 +1,50 @@ +/** + * @file + * + * @brief Implementations requiring the definitions of multiple CUDA entity proxy classes, + * and which regard (non-contextualized) library kernels. 
+ */
+#pragma once
+#ifndef CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP
+#define CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP
+
+#include "kernel.hpp"
+#include "../library.hpp"
+
+namespace cuda {
+
+namespace library {
+
+namespace kernel {
+
+// Fix: `inline` added - a non-inline definition in a header violates the ODR
+// once the header is included from more than one translation unit
+inline attribute_value_t get_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device)
+{
+	return detail_::get_attribute(library_kernel.handle(), device.id(), attribute);
+}
+
+inline void set_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device,
+	attribute_value_t value)
+{
+	detail_::set_attribute(library_kernel.handle(), device.id(), attribute, value);
+}
+
+} // namespace kernel
+
+} // namespace library
+
+// Fix: defined in namespace cuda, matching the declaration in kernels/in_library.hpp
+// and the qualified call `cuda::contextualize(...)` in kernel_launch.hpp; it was
+// previously defined inside library::kernel, leaving cuda::contextualize undefined.
+// Also marked `inline` for the same ODR reason as above.
+inline kernel_t contextualize(const library::kernel_t& kernel, const context_t& context)
+{
+	auto new_handle = library::kernel::detail_::contextualize(kernel.handle(), context.handle());
+	using cuda::kernel::wrap;
+	return wrap(context.device_id(), context.handle(), new_handle, do_not_hold_primary_context_refcount_unit);
+}
+
+// Fix: this member function was declared in kernels/in_library.hpp but never defined
+inline kernel_t library::kernel_t::contextualize(const context_t& context) const
+{
+	return cuda::contextualize(*this, context);
+}
+
+} // namespace cuda
+
+#endif // CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP