From ed7775371a039971fe8a49d487c02622f80f459c Mon Sep 17 00:00:00 2001 From: Eyal Rozenberg Date: Thu, 25 Jan 2024 22:07:59 +0200 Subject: [PATCH] Fixes #565: Completed the implementation of a library kernel class - completing the support for libraries overall CAVEAT: No example code to actually load and launch any cuLibraries --- src/cuda/api.hpp | 1 + src/cuda/api/kernel_launch.hpp | 38 +++- src/cuda/api/kernels/in_library.hpp | 206 ++++++++++++++++++ src/cuda/api/library.hpp | 52 ++--- .../api/multi_wrapper_impls/kernel_launch.hpp | 39 +++- .../multi_wrapper_impls/library_kernel.hpp | 50 +++++ 6 files changed, 354 insertions(+), 32 deletions(-) create mode 100644 src/cuda/api/kernels/in_library.hpp create mode 100644 src/cuda/api/multi_wrapper_impls/library_kernel.hpp diff --git a/src/cuda/api.hpp b/src/cuda/api.hpp index a6143017..1f4a4657 100644 --- a/src/cuda/api.hpp +++ b/src/cuda/api.hpp @@ -42,6 +42,7 @@ #include "api/module.hpp" #if CUDA_VERSION >= 12000 #include "api/library.hpp" +#include "api/kernels/in_library.hpp" #endif #include "api/link.hpp" diff --git a/src/cuda/api/kernel_launch.hpp b/src/cuda/api/kernel_launch.hpp index a57092d9..eb7bdfb4 100644 --- a/src/cuda/api/kernel_launch.hpp +++ b/src/cuda/api/kernel_launch.hpp @@ -43,6 +43,10 @@ #include "launch_configuration.hpp" #include "kernel.hpp" #include "kernels/apriori_compiled.hpp" +#if CUDA_VERSION >= 12000 +#include "kernels/in_library.hpp" +#endif + #if CUDA_VERSION >= 9000 // The following is necessary for cudaLaunchCooperativeKernel @@ -124,6 +128,7 @@ struct enqueue_launch_helper { template void enqueue_launch( + ::std::integral_constant, ::std::integral_constant, Kernel&& kernel_function, const stream_t& stream, @@ -132,6 +137,16 @@ void enqueue_launch( template void enqueue_launch( + ::std::integral_constant, + ::std::integral_constant, + Kernel&& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... 
parameters); + +template +void enqueue_launch( + ::std::integral_constant, ::std::integral_constant, Kernel&& kernel, const stream_t& stream, @@ -308,12 +323,18 @@ void enqueue_launch( static_assert( detail_::all_true<::std::is_trivially_copy_constructible>::value...>::value, "All kernel parameter types must be of a trivially copy-constructible (decayed) type." ); - static constexpr const bool wrapped_kernel = ::std::is_base_of::type>::value; + static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of::type>::value; +#if CUDA_VERSION >= 12000 + static constexpr const bool library_kernel = cuda::detail_::is_library_kernel::value; +#else + static constexpr const bool library_kernel = false; +#endif // CUDA_VERSION >= 12000 // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to // use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch // function for each of them will basically be just a one-liner :-( detail_::enqueue_launch( - ::std::integral_constant{}, + ::std::integral_constant{}, + ::std::integral_constant{}, ::std::forward(kernel), stream, launch_configuration, ::std::forward(parameters)...); } @@ -338,12 +359,15 @@ void launch( * Type of the container for the marshalled arguments; typically, this * would be `span` - but it can be an `::std::vector`, or * have non-const `void*` elements etc. + * @param kernel + * A wrapped GPU kernel * @param stream * Proxy for the stream on which to enqueue the kernel launch; may be the * default stream of a context. 
* @param marshalled_arguments * A container of `void` or `const void` pointers to the argument values */ +///@{ template void launch_type_erased( const kernel_t& kernel, @@ -351,6 +375,16 @@ void launch_type_erased( launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments); +#if CUDA_VERSION >= 12000 +template +void launch_type_erased( + const library::kernel_t& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + SpanOfConstVoidPtrLike marshalled_arguments); +///@} +#endif // CUDA_VERSION >= 12000 + } // namespace cuda #endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_ diff --git a/src/cuda/api/kernels/in_library.hpp b/src/cuda/api/kernels/in_library.hpp new file mode 100644 index 00000000..9e7225a6 --- /dev/null +++ b/src/cuda/api/kernels/in_library.hpp @@ -0,0 +1,206 @@ +/** + * @file + * + * @brief A @ref A wrapper class for compiled kernels in a loaded library, + * which are unassociated with a device and a context. + */ +#pragma once +#ifndef CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ +#define CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ + +#include "../library.hpp" + +namespace cuda { + +///@cond +class kernel_t; +///@nocond + +namespace detail_ { + +template +struct is_library_kernel : ::std::is_same::type, library::kernel_t> { }; + +} // namespace detail_ + +// TODO: Avoid the copy? +kernel_t contextualize(const library::kernel_t& kernel, const context_t& context); + +namespace library { + +///@cond +class kernel_t; +///@nocond + +namespace kernel { + +using handle_t = CUkernel; +using cuda::kernel::attribute_t; +using cuda::kernel::attribute_value_t; +// using cuda::kernel::apriori_compiled::attributes_t; + +namespace detail_ { + +// Note: library kernels never hold a PC refcount unit, nor do they own anything; +// only the library wrapper owns (and it's not associated with the kernel). 
+kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle);
+
+inline ::std::string identify(kernel::handle_t handle)
+{
+	return "library kernel at " + cuda::detail_::ptr_as_hex(handle);
+}
+
+inline ::std::string identify(library::handle_t library_handle, kernel::handle_t handle)
+{
+	return identify(handle) + " within " + library::detail_::identify(library_handle);
+}
+
+::std::string identify(const kernel_t &kernel);
+
+inline ::std::pair<cuda::kernel::handle_t, status_t> contextualize_in_current_context(
+	const kernel::handle_t& library_kernel_handle)
+{
+	cuda::kernel::handle_t contextualized_kernel_handle;
+	auto status = cuKernelGetFunction(&contextualized_kernel_handle, library_kernel_handle);
+	return {contextualized_kernel_handle, status};
+}
+
+inline cuda::kernel::handle_t contextualize(
+	const handle_t& kernel_handle,
+	const context::handle_t context_handle)
+{
+	CAW_SET_SCOPE_CONTEXT(context_handle);
+	auto handle_and_status = contextualize_in_current_context(kernel_handle);
+	throw_if_error_lazy(handle_and_status.second, "Failed placing " + identify(kernel_handle) + " in "
+		+ context::detail_::identify(context_handle));
+	return handle_and_status.first;
+}
+
+inline attribute_value_t get_attribute(
+	handle_t library_kernel_handle,
+	device::id_t device_id,
+	kernel::attribute_t attribute)
+{
+	attribute_value_t value;
+	auto status = cuKernelGetAttribute(&value, attribute, library_kernel_handle, device_id);
+	throw_if_error_lazy(status, ::std::string("Failed getting attribute ")
+		+ cuda::kernel::detail_::attribute_name(attribute) + " for " + identify(library_kernel_handle)
+		+ " on " + device::detail_::identify(device_id));
+	return value; // Fix: return statement was missing - control fell off the end of a non-void function (UB)
+}
+
+// Fix: was declared as returning attribute_value_t, but returned nothing; a setter
+// has no value to report, and the public set_attribute wrapper returns void
+inline void set_attribute(
+	kernel::handle_t library_kernel_handle,
+	device::id_t device_id,
+	kernel::attribute_t attribute,
+	attribute_value_t value)
+{
+	auto status = cuKernelSetAttribute(attribute, value, library_kernel_handle, device_id);
+	throw_if_error_lazy(status, ::std::string("Failed setting attribute ") +
cuda::kernel::detail_::attribute_name(attribute) + " value to " + ::std::to_string(value) +
+		" for " + identify(library_kernel_handle) + " on " + device::detail_::identify(device_id));
+}
+
+} // namespace detail_
+
+attribute_value_t get_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device);
+
+inline void set_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device,
+	attribute_value_t value);
+
+} // namespace kernel
+
+/**
+ * @brief A proxy class for a kernel within a loaded library - not (yet)
+ * associated with any device or context, unlike a @ref `cuda::kernel_t`.
+ */
+class kernel_t {
+public: // getters
+	kernel::handle_t handle() const noexcept { return handle_; }
+	library::handle_t library_handle() const noexcept { return library_handle_; }
+	library_t library() const noexcept { return library::detail_::wrap(library_handle_); }
+
+public: // type_conversions
+
+public: // non-mutators
+
+#if CUDA_VERSION >= 12300
+	/**
+	 * Return the kernel function name as registered within its library
+	 *
+	 * @note This may return a mangled name if the kernel function was not declared as having C linkage.
+ */ + const char* name() const + { + if (name_ != nullptr) { return name_; } + const char* result; + auto status = cuKernelGetName(&result, handle_); + throw_if_error_lazy(status, "Retrieving the name of " + kernel::detail_::identify(*this)); + name_ = result; + return name_; + } +#endif + cuda::kernel_t contextualize(const context_t& context) const; + +protected: // ctors & dtor + kernel_t(library::handle_t library_handle, kernel::handle_t handle) + : + library_handle_(library_handle), handle_(handle) {} + +public: // ctors & dtor + kernel_t(const kernel_t &) = default; + kernel_t(kernel_t&& other) = default; + +public: // friends + friend kernel_t kernel::detail_::wrap(library::handle_t, kernel::handle_t); + +protected: // data members + library::handle_t library_handle_; + kernel::handle_t handle_; + mutable const char* name_ { nullptr }; // The name is cached after having been retrieved for the first time +}; // kernel_t + +namespace kernel { +namespace detail_ { + +inline kernel_t wrap(library::handle_t library_handle, kernel::handle_t handle) +{ + return {library_handle, handle}; +} + +inline ::std::string identify(const kernel_t& library_kernel) +{ + return identify(library_kernel.library_handle(), library_kernel.handle()); +} + +} // namespace detail_ + +inline kernel_t get(const library_t& library, const char* name) +{ + auto kernel_handle = library::detail_::get_kernel(library.handle(), name); + return kernel::detail_::wrap(library.handle(), kernel_handle); +} + +} // namespace kernel + +} // namespace library + +inline library::kernel_t library_t::get_kernel(const char* name) const +{ + return library::kernel::get(*this, name); +} + +inline library::kernel_t library_t::get_kernel(const ::std::string& name) const +{ + return get_kernel(name.c_str()); +} + +} // namespace cuda + +#endif // CUDA_API_WRAPPERS_IN_LIBRARY_KERNEL_HPP_ + diff --git a/src/cuda/api/library.hpp b/src/cuda/api/library.hpp index b8cc4eac..898666f1 100644 --- 
a/src/cuda/api/library.hpp +++ b/src/cuda/api/library.hpp @@ -11,6 +11,7 @@ #if CUDA_VERSION >= 12000 #include "module.hpp" +#include "error.hpp" #if __cplusplus >= 201703L #include @@ -19,7 +20,6 @@ namespace cuda { ///@cond -class device_t; class context_t; class module_t; class library_t; @@ -74,10 +74,25 @@ library_t create( bool code_is_preserved); ///@} + +namespace detail_ { + +inline kernel::handle_t get_kernel(handle_t library_handle, const char* name) +{ + library::kernel::handle_t kernel_handle; + auto status = cuLibraryGetKernel(&kernel_handle, library_handle, name); + throw_if_error_lazy(status, ::std::string{"Failed obtaining kernel "} + name + + "' from " + library::detail_::identify(library_handle)); + return kernel_handle; +} + +} // namespace detail_ + +inline kernel_t get_kernel(const library_t& library, const char* name); + } // namespace library memory::region_t get_global(const context_t& context, const library_t& library, const char* name); -kernel_t get_kernel(const context_t& context, const library_t& library, const char* name); memory::region_t get_managed_region(const library_t& library, const char* name); namespace module { @@ -110,15 +125,8 @@ class library_t { * @return An enqueable kernel proxy object for the requested kernel, * in the current context. */ - cuda::kernel_t get_kernel(const char* name) const - { - return cuda::get_kernel(context::current::get(), *this, name); - } - - cuda::kernel_t get_kernel(const ::std::string& name) const - { - return get_kernel(name.c_str()); - } + library::kernel_t get_kernel(const char* name) const; + library::kernel_t get_kernel(const ::std::string& name) const; memory::region_t get_global(const char* name) const { @@ -196,22 +204,10 @@ inline memory::region_t get_global(const context_t& context, const library_t& li // Note: Nothing is holding a PC refcount unit here! 
} -// Implement other get's +// More library item getters +namespace library { -inline kernel_t get_kernel(const context_t& context, const library_t& library, const char* name) -{ - CAW_SET_SCOPE_CONTEXT(context.handle()); - library::kernel::handle_t new_handle; - auto status = cuLibraryGetKernel(&new_handle, library.handle(), name); - throw_if_error_lazy(status, ::std::string("Failed obtaining kernel '") + name - + "' from " + library::detail_::identify(library)); - kernel::handle_t new_proper_kernel_handle; - status = cuKernelGetFunction(&new_proper_kernel_handle, new_handle); - throw_if_error_lazy(status, ::std::string("Failed obtaining a context-associated kernel ") - + "from kernel '" + name + "' in " + library::detail_::identify(library)); - return kernel::wrap(context.device_id(), context.handle(), - new_proper_kernel_handle, do_hold_primary_context_refcount_unit); -} +} // namespace library inline memory::region_t get_managed_region(const library_t& library, const char* name) { @@ -225,6 +221,9 @@ inline memory::region_t get_managed_region(const library_t& library, const char* namespace module { +/** + * Create an in-context module from the compiled code within a loaded library + */ inline module_t create(const context_t& context, const library_t& library) { CAW_SET_SCOPE_CONTEXT(context.handle()); @@ -241,6 +240,7 @@ inline module_t create(const context_t& context, const library_t& library) } // namespace module +// I really have no idea what this does! 
inline void* get_unified_function(const context_t& context, const library_t& library, const char* symbol) { CAW_SET_SCOPE_CONTEXT(context.handle()); diff --git a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp index e546430a..24278859 100644 --- a/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp +++ b/src/cuda/api/multi_wrapper_impls/kernel_launch.hpp @@ -207,7 +207,8 @@ struct enqueue_launch_helper { template void enqueue_launch( - ::std::integral_constant, // Got a raw kernel function + ::std::integral_constant, // Not a wrapped contextual kernel, + ::std::integral_constant, // and not a library kernel, so it must be a raw kernel function RawKernelFunction&& kernel_function, const stream_t& stream, launch_configuration_t launch_configuration, @@ -228,7 +229,8 @@ void enqueue_launch( template void enqueue_launch( -::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)class + ::std::integral_constant, // Not a library kernel Kernel&& kernel, const stream_t& stream, launch_configuration_t launch_configuration, @@ -239,6 +241,23 @@ ::std::integral_constant, // a kernel wrapped in a kernel_t (sub)cla ::std::forward(parameters)...); } +#if CUDA_VERSION >= 12000 +template +void enqueue_launch( + ::std::integral_constant, // Not a wrapped contextual kernel, + ::std::integral_constant, // but a library kernel + Kernel&& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + KernelParameters&&... 
parameters) +{ + kernel_t contextualized = cuda::contextualize(kernel, stream.context()); + enqueue_launch_helper {}( + contextualized, stream, launch_configuration, + ::std::forward(parameters)...); +} +#endif // CUDA_VERSION >= 12000 + } // namespace detail_ template @@ -253,8 +272,7 @@ inline void launch( // Note: If Kernel is a kernel_t, and its associated device is different // than the current device, the next call will fail: - enqueue_launch(kernel, stream, launch_configuration, - ::std::forward(parameters)...); + enqueue_launch(kernel, stream, launch_configuration, ::std::forward(parameters)...); } template @@ -283,6 +301,19 @@ inline void launch_type_erased( static_cast(marshalled_arguments.data())); } +#if CUDA_VERSION >= 12000 +template +void launch_type_erased( + const library::kernel_t& kernel, + const stream_t& stream, + launch_configuration_t launch_configuration, + SpanOfConstVoidPtrLike marshalled_arguments) +{ + auto contextualized = contextualize(kernel, stream.context()); + launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments); +} +#endif // CUDA_VERSION >= 12000 + #if ! CAN_GET_APRIORI_KERNEL_HANDLE #if defined(__CUDACC__) diff --git a/src/cuda/api/multi_wrapper_impls/library_kernel.hpp b/src/cuda/api/multi_wrapper_impls/library_kernel.hpp new file mode 100644 index 00000000..4b4d4374 --- /dev/null +++ b/src/cuda/api/multi_wrapper_impls/library_kernel.hpp @@ -0,0 +1,50 @@ +/** + * @file + * + * @brief Implementations requiring the definitions of multiple CUDA entity proxy classes, + * and which regard (non-contextualized) library kernels. 
+ */
+#pragma once
+#ifndef CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP
+#define CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP
+
+#include "kernel.hpp"
+#include "../library.hpp"
+
+namespace cuda {
+
+namespace library {
+
+namespace kernel {
+
+// Fix: `inline` added - a non-inline definition in a header violates the ODR
+// once the header is included from more than one translation unit
+inline attribute_value_t get_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device)
+{
+	return detail_::get_attribute(library_kernel.handle(), device.id(), attribute);
+}
+
+inline void set_attribute(
+	const library::kernel_t& library_kernel,
+	kernel::attribute_t attribute,
+	const device_t& device,
+	attribute_value_t value)
+{
+	detail_::set_attribute(library_kernel.handle(), device.id(), attribute, value);
+}
+
+} // namespace kernel
+
+} // namespace library
+
+// Fix: defined in namespace cuda, matching the declaration in kernels/in_library.hpp
+// and the qualified call `cuda::contextualize(...)` in kernel_launch.hpp; it was
+// previously defined inside library::kernel, leaving cuda::contextualize undefined.
+// Also marked `inline` for the same ODR reason as above.
+inline kernel_t contextualize(const library::kernel_t& kernel, const context_t& context)
+{
+	auto new_handle = library::kernel::detail_::contextualize(kernel.handle(), context.handle());
+	using cuda::kernel::wrap;
+	return wrap(context.device_id(), context.handle(), new_handle, do_not_hold_primary_context_refcount_unit);
+}
+
+// Fix: this member function was declared in kernels/in_library.hpp but never defined
+inline kernel_t library::kernel_t::contextualize(const context_t& context) const
+{
+	return cuda::contextualize(*this, context);
+}
+
+} // namespace cuda
+
+#endif // CUDA_API_WRAPPERS_MULTI_WRAPPER_LIBRARY_KERNEL_HPP