Fixes #565: Completed the implementation of a library kernel class - completing the support for libraries overall

CAVEAT: No example code to actually load and launch any cuLibraries
eyalroz committed Mar 1, 2024
1 parent 53d715b commit ed77753
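
Since the commit itself ships no example code (see the caveat above), here is a minimal, hypothetical usage sketch of how the pieces added here are meant to fit together. Nothing in it is part of the commit: how the library_t gets loaded is left out, and the kernel name "scale", its argument list, and the use of stream_t::context() are illustrative assumptions only.

// Hypothetical sketch, not part of this commit: `lib` is assumed to have been
// loaded already via the library.hpp API; kernel name and arguments are made up.
#include <cuda/api.hpp>

void example(const cuda::library_t& lib, const cuda::stream_t& stream,
    cuda::launch_configuration_t config, float* data, unsigned n)
{
    // A library kernel is not associated with any device or context
    cuda::library::kernel_t lib_kernel = lib.get_kernel("scale");

    // Either launch it directly - the new enqueue_launch overload for library
    // kernels takes care of contextualization -
    cuda::enqueue_launch(lib_kernel, stream, config, data, n);

    // ... or explicitly obtain a context-bound cuda::kernel_t first:
    cuda::kernel_t contextualized = lib_kernel.contextualize(stream.context());
    cuda::enqueue_launch(contextualized, stream, config, data, n);
}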
Showing 6 changed files with 354 additions and 32 deletions.
1 change: 1 addition & 0 deletions src/cuda/api.hpp
@@ -42,6 +42,7 @@
#include "api/module.hpp"
#if CUDA_VERSION >= 12000
#include "api/library.hpp"
#include "api/kernels/in_library.hpp"
#endif
#include "api/link.hpp"

38 changes: 36 additions & 2 deletions src/cuda/api/kernel_launch.hpp
@@ -43,6 +43,10 @@
#include "launch_configuration.hpp"
#include "kernel.hpp"
#include "kernels/apriori_compiled.hpp"
#if CUDA_VERSION >= 12000
#include "kernels/in_library.hpp"
#endif


#if CUDA_VERSION >= 9000
// The following is necessary for cudaLaunchCooperativeKernel
@@ -124,6 +128,7 @@ struct enqueue_launch_helper {

template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
::std::integral_constant<bool, false>,
::std::integral_constant<bool, false>,
Kernel&& kernel_function,
const stream_t& stream,
@@ -132,6 +137,16 @@

template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
::std::integral_constant<bool, true>,
::std::integral_constant<bool, false>,
Kernel&& kernel,
const stream_t& stream,
launch_configuration_t launch_configuration,
KernelParameters&&... parameters);

template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
::std::integral_constant<bool, false>,
::std::integral_constant<bool, true>,
Kernel&& kernel,
const stream_t& stream,
@@ -308,12 +323,18 @@ void enqueue_launch(
static_assert(
detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
"All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
static constexpr const bool wrapped_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
#if CUDA_VERSION >= 12000
static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
#else
static constexpr const bool library_kernel = false;
#endif // CUDA_VERSION >= 12000
// We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
// use tagged dispatch for the separate behavior for raw, wrapped-contextual and library kernels -
// although the enqueue_launch function for each of them will basically be just a one-liner :-(
detail_::enqueue_launch<Kernel, KernelParameters...>(
::std::integral_constant<bool, wrapped_kernel>{},
::std::integral_constant<bool, wrapped_contextual_kernel>{},
::std::integral_constant<bool, library_kernel>{},
::std::forward<Kernel>(kernel), stream, launch_configuration,
::std::forward<KernelParameters>(parameters)...);
}
@@ -338,19 +359,32 @@ void launch(
* Type of the container for the marshalled arguments; typically, this
* would be `span<const void*>` - but it can be an `::std::vector`, or
* have non-const `void*` elements etc.
* @param kernel
* A wrapped GPU kernel
* @param stream
* Proxy for the stream on which to enqueue the kernel launch; may be the
* default stream of a context.
* @param marshalled_arguments
* A container of `void` or `const void` pointers to the argument values
*/
///@{
template <typename SpanOfConstVoidPtrLike>
void launch_type_erased(
const kernel_t& kernel,
const stream_t& stream,
launch_configuration_t launch_configuration,
SpanOfConstVoidPtrLike marshalled_arguments);

#if CUDA_VERSION >= 12000
template <typename SpanOfConstVoidPtrLike>
void launch_type_erased(
const library::kernel_t& kernel,
const stream_t& stream,
launch_configuration_t launch_configuration,
SpanOfConstVoidPtrLike marshalled_arguments);
///@}
#endif // CUDA_VERSION >= 12000

} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
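
The header also declares (for CUDA 12.0 and later) a type-erased launch path for library kernels. A minimal sketch of using it follows, with the same hypothetical kernel and arguments as above; per the documentation block in the diff, an ::std::vector of const void* is one acceptable container for the marshalled arguments.

// Hypothetical sketch, not part of this commit: launching a library kernel with
// type-erased (marshalled) arguments, i.e. pointers to each argument value.
#include <cuda/api.hpp>
#include <vector>

void type_erased_example(const cuda::library::kernel_t& lib_kernel,
    const cuda::stream_t& stream, cuda::launch_configuration_t config,
    float* data, unsigned n)
{
    ::std::vector<const void*> marshalled { &data, &n };
    cuda::launch_type_erased(lib_kernel, stream, config, marshalled);
}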
206 changes: 206 additions & 0 deletions src/cuda/api/kernels/in_library.hpp
@@ -0,0 +1,206 @@
/**
* @file
*
* @brief A wrapper class for compiled kernels in a loaded library,
* which are not associated with any specific device or context.
*/
#pragma once
#ifndef CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_
#define CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_

#include "../library.hpp"

namespace cuda {

///@cond
class kernel_t;
///@endcond

namespace detail_ {

template <typename Kernel>
struct is_library_kernel : ::std::is_same<typename ::std::decay<Kernel>::type, library::kernel_t> { };

} // namespace detail_

// TODO: Avoid the copy?
kernel_t contextualize(const library::kernel_t& kernel, const context_t& context);

namespace library {

///@cond
class kernel_t;
///@endcond

namespace kernel {

using handle_t = CUkernel;
using cuda::kernel::attribute_t;
using cuda::kernel::attribute_value_t;
// using cuda::kernel::apriori_compiled::attributes_t;

namespace detail_ {

// Note: library kernels never hold a PC refcount unit, nor do they own anything;
// only the library wrapper owns resources - and it is not associated with any individual kernel.
kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle);

inline ::std::string identify(kernel::handle_t handle)
{
return "library kernel at " + cuda::detail_::ptr_as_hex(handle);
}

inline ::std::string identify(library::handle_t library_handle, kernel::handle_t handle)
{
return identify(handle) + " within " + library::detail_::identify(library_handle);
}

::std::string identify(const kernel_t &kernel);

inline ::std::pair<cuda::kernel::handle_t, status_t> contextualize_in_current_context(
const kernel::handle_t& library_kernel_handle)
{
cuda::kernel::handle_t contextualized_kernel_handle;
auto status = cuKernelGetFunction(&contextualized_kernel_handle, library_kernel_handle);
return {contextualized_kernel_handle, status};
}

inline cuda::kernel::handle_t contextualize(
const handle_t& kernel_handle,
const context::handle_t context_handle)
{
CAW_SET_SCOPE_CONTEXT(context_handle);
auto handle_and_status = contextualize_in_current_context(kernel_handle);
throw_if_error_lazy(handle_and_status.second, "Failed placing " + identify(kernel_handle) + " in "
+ context::detail_::identify(context_handle));
return handle_and_status.first;
}

inline attribute_value_t get_attribute(
handle_t library_kernel_handle,
device::id_t device_id,
kernel::attribute_t attribute)
{
attribute_value_t value;
auto status = cuKernelGetAttribute(&value, attribute, library_kernel_handle, device_id);
throw_if_error_lazy(status, ::std::string("Failed getting attribute ")
+ cuda::kernel::detail_::attribute_name(attribute) + " for " + identify(library_kernel_handle)
+ " on " + device::detail_::identify(device_id));
return value;
}

inline void set_attribute(
kernel::handle_t library_kernel_handle,
device::id_t device_id,
kernel::attribute_t attribute,
attribute_value_t value)
{
auto status = cuKernelSetAttribute(attribute, value, library_kernel_handle, device_id);
throw_if_error_lazy(status, ::std::string("Failed setting attribute ")
+ cuda::kernel::detail_::attribute_name(attribute) + " value to " + ::std::to_string(value)
+ " for " + identify(library_kernel_handle) + " on " + device::detail_::identify(device_id));
}

} // namespace detail_

attribute_value_t get_attribute(
const library::kernel_t& library_kernel,
kernel::attribute_t attribute,
const device_t& device);

inline void set_attribute(
const library::kernel_t& library_kernel,
kernel::attribute_t attribute,
const device_t& device,
attribute_value_t value);

} // namespace kernel

/**
* @brief A wrapper class for kernels residing in a loaded library (@ref library_t),
* which - unlike @ref cuda::kernel_t - are not associated with any device or context.
*/
class kernel_t {
public: // getters
kernel::handle_t handle() const noexcept { return handle_; }
library::handle_t library_handle() const noexcept { return library_handle_; }
library_t library() const noexcept { return library::detail_::wrap(library_handle_); }

public: // type_conversions

public: // non-mutators

#if CUDA_VERSION >= 12300
/**
* Return the kernel function name as registered within its library
*
* @note This may return a mangled name if the kernel function was not declared as having C linkage.
*/
const char* name() const
{
if (name_ != nullptr) { return name_; }
const char* result;
auto status = cuKernelGetName(&result, handle_);
throw_if_error_lazy(status, "Retrieving the name of " + kernel::detail_::identify(*this));
name_ = result;
return name_;
}
#endif
cuda::kernel_t contextualize(const context_t& context) const;

protected: // ctors & dtor
kernel_t(library::handle_t library_handle, kernel::handle_t handle)
:
library_handle_(library_handle), handle_(handle) {}

public: // ctors & dtor
kernel_t(const kernel_t &) = default;
kernel_t(kernel_t&& other) = default;

public: // friends
friend kernel_t kernel::detail_::wrap(library::handle_t, kernel::handle_t);

protected: // data members
library::handle_t library_handle_;
kernel::handle_t handle_;
mutable const char* name_ { nullptr }; // The name is cached after having been retrieved for the first time
}; // kernel_t

namespace kernel {
namespace detail_ {

inline kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle)
{
return {library_handle, handle};
}

inline ::std::string identify(const kernel_t& library_kernel)
{
return identify(library_kernel.library_handle(), library_kernel.handle());
}

} // namespace detail_

inline kernel_t get(const library_t& library, const char* name)
{
auto kernel_handle = library::detail_::get_kernel(library.handle(), name);
return kernel::detail_::wrap(library.handle(), kernel_handle);
}

} // namespace kernel

} // namespace library

inline library::kernel_t library_t::get_kernel(const char* name) const
{
return library::kernel::get(*this, name);
}

inline library::kernel_t library_t::get_kernel(const ::std::string& name) const
{
return get_kernel(name.c_str());
}

} // namespace cuda

#endif // CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_
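
Finally, a sketch of the device-parameterized attribute query and the free-standing contextualize() declared in this header. The use of device_t::primary_context() and of the raw CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK enumerator are assumptions, not part of this commit.

// Hypothetical sketch, not part of this commit: querying a library kernel's
// attributes requires naming a device, since the kernel is not context-bound.
#include <cuda/api.hpp>

void attribute_example(const cuda::library_t& lib, const cuda::device_t& device)
{
    auto lib_kernel = lib.get_kernel("scale"); // illustrative kernel name
    auto max_threads = cuda::library::kernel::get_attribute(
        lib_kernel, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);
    // To actually launch, the kernel can be bound to a context on that device:
    auto contextualized = cuda::contextualize(lib_kernel, device.primary_context());
    (void) max_threads; (void) contextualized;
}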
