From d944ff3391dfbe69db453406bd0bbcb78716dee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Fri, 2 Aug 2024 19:01:13 +0100 Subject: [PATCH] Add support for command-buffer kernel updates - Updates the specification to add support for command-buffer kernel handle updates. - Adds new UR tests for this feature. - Adds an implementation for the Cuda and Hip adapters. - Changes the UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP enum to a new UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP which uses a bitfield instead of a boolean. - Changes the spec for urCommandBufferUpdateKernelLaunchExp and ur_exp_command_buffer_update_kernel_launch_desc_t to make it more intuitive with less complicated errors codes: - Passing a non-nullptr local work-group will now always update the command to use that value. - Passing a nullptr local work-group will now always keep the current command behaviour (either generated by the implementation or user-defined). - Passing zero to newWorkDim is now an error. --- include/ur_api.h | 83 +++- include/ur_ddi.h | 2 + include/ur_print.h | 8 + include/ur_print.hpp | 138 +++++- scripts/core/EXP-COMMAND-BUFFER.rst | 38 +- scripts/core/exp-command-buffer.yml | 82 ++- source/adapters/cuda/command_buffer.cpp | 200 +++++--- source/adapters/cuda/command_buffer.hpp | 14 +- source/adapters/cuda/device.cpp | 12 +- source/adapters/hip/command_buffer.cpp | 190 ++++--- source/adapters/hip/command_buffer.hpp | 14 +- source/adapters/hip/device.cpp | 20 +- source/adapters/level_zero/command_buffer.cpp | 34 +- source/adapters/level_zero/device.cpp | 42 +- .../level_zero/ur_interface_loader.hpp | 1 + source/adapters/level_zero/v2/api.cpp | 1 + source/adapters/mock/ur_mockddi.cpp | 15 +- source/adapters/native_cpu/command_buffer.cpp | 2 +- source/adapters/native_cpu/device.cpp | 4 +- source/adapters/opencl/command_buffer.cpp | 42 +- source/adapters/opencl/command_buffer.hpp | 11 +- source/adapters/opencl/common.cpp | 43 +- source/adapters/opencl/common.hpp | 5 +- source/adapters/opencl/device.cpp | 8 +- source/loader/layers/tracing/ur_trcddi.cpp | 19 +- source/loader/layers/validation/ur_valddi.cpp | 30 +- source/loader/loader.def.in | 1 + source/loader/loader.map.in | 1 + source/loader/ur_ldrddi.cpp | 33 +- source/loader/ur_libapi.cpp | 31 +- source/loader/ur_print.cpp | 8 + source/ur_api.cpp | 23 +- test/conformance/device_code/CMakeLists.txt | 1 + test/conformance/device_code/fill_usm_2d.cpp | 31 ++ .../exp_command_buffer/CMakeLists.txt | 13 +- .../exp_command_buffer/commands.cpp | 2 +- .../exp_command_buffer_adapter_cuda.match | 1 - .../exp_command_buffer_adapter_hip.match | 1 - ...command_buffer_adapter_level_zero_v2.match | 61 +-- ...xp_command_buffer_adapter_native_cpu.match | 28 +- .../conformance/exp_command_buffer/fixtures.h | 252 +++++++--- .../buffer_fill_kernel_update.cpp | 67 +-- .../buffer_saxpy_kernel_update.cpp | 10 +- .../{ => update}/invalid_update.cpp | 131 ++--- .../update/kernel_handle_update.cpp | 469 ++++++++++++++++++ .../{ => update}/ndrange_update.cpp | 198 ++++++-- .../{ => update}/usm_fill_kernel_update.cpp | 36 +- .../{ => update}/usm_saxpy_kernel_update.cpp | 16 +- test/conformance/testing/include/uur/raii.h | 6 + tools/urinfo/urinfo.hpp | 4 +- 50 files changed, 1831 insertions(+), 651 deletions(-) create mode 100644 test/conformance/device_code/fill_usm_2d.cpp rename test/conformance/exp_command_buffer/{ => update}/buffer_fill_kernel_update.cpp (91%) rename test/conformance/exp_command_buffer/{ => update}/buffer_saxpy_kernel_update.cpp (97%) rename test/conformance/exp_command_buffer/{ => update}/invalid_update.cpp (65%) create mode 100644 test/conformance/exp_command_buffer/update/kernel_handle_update.cpp rename test/conformance/exp_command_buffer/{ => update}/ndrange_update.cpp (56%) rename test/conformance/exp_command_buffer/{ => update}/usm_fill_kernel_update.cpp (93%) rename test/conformance/exp_command_buffer/{ => update}/usm_saxpy_kernel_update.cpp (96%) diff --git a/include/ur_api.h b/include/ur_api.h index c95829bafe..8efa9b88b4 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -1637,8 +1637,8 @@ typedef enum ur_device_info_t { ///< `EnqueueDeviceGlobalVariableRead` entry points. UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of ///< command-buffers. - UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel - ///< commands in a command-buffer. + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP = 0x1001, ///< [::ur_device_command_buffer_update_capability_flags_t] Command-buffer + ///< update capabilities of the device UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP = 0x1111, ///< [::ur_bool_t] return true if enqueue Cluster Launch is supported UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of ///< bindless images @@ -8167,6 +8167,27 @@ urBindlessImagesSignalExternalSemaphoreExp( #if !defined(__GNUC__) #pragma region command_buffer_(experimental) #endif +/////////////////////////////////////////////////////////////////////////////// +/// @brief Device kernel execution capability +typedef uint32_t ur_device_command_buffer_update_capability_flags_t; +typedef enum ur_device_command_buffer_update_capability_flag_t { + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS = UR_BIT(0), ///< Device supports updating the kernel arguments in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE = UR_BIT(1), ///< Device supports updating the local work-group size in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE = UR_BIT(2), ///< Device supports updating the global work-group size in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET = UR_BIT(3), ///< Device supports updating the global work offset in command-buffer + ///< commands. + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE = UR_BIT(4), ///< Device supports updating the kernel handle in command-buffer commands. + /// @cond + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_FORCE_UINT32 = 0x7fffffff + /// @endcond + +} ur_device_command_buffer_update_capability_flag_t; +/// @brief Bit Mask for validating ur_device_command_buffer_update_capability_flags_t +#define UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAGS_MASK 0xffffffe0 + /////////////////////////////////////////////////////////////////////////////// /// @brief Command-buffer query information type typedef enum ur_exp_command_buffer_info_t { @@ -8220,7 +8241,7 @@ typedef struct ur_exp_command_buffer_update_memobj_arg_desc_t { ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_MEMOBJ_ARG_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. - const ur_kernel_arg_mem_obj_properties_t *pProperties; ///< [in][optinal] Pointer to memory object properties. + const ur_kernel_arg_mem_obj_properties_t *pProperties; ///< [in][optional] Pointer to memory object properties. ur_mem_handle_t hNewMemObjArg; ///< [in][optional] Handle of memory object to set at argument index. } ur_exp_command_buffer_update_memobj_arg_desc_t; @@ -8232,7 +8253,7 @@ typedef struct ur_exp_command_buffer_update_pointer_arg_desc_t { ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. - const ur_kernel_arg_pointer_properties_t *pProperties; ///< [in][optinal] Pointer to USM pointer properties. + const ur_kernel_arg_pointer_properties_t *pProperties; ///< [in][optional] Pointer to USM pointer properties. const void *pNewPointerArg; ///< [in][optional] USM pointer to memory location holding the argument ///< value to set at argument index. @@ -8246,7 +8267,7 @@ typedef struct ur_exp_command_buffer_update_value_arg_desc_t { const void *pNext; ///< [in][optional] pointer to extension-specific structure uint32_t argIndex; ///< [in] Argument index. uint32_t argSize; ///< [in] Argument size. - const ur_kernel_arg_value_properties_t *pProperties; ///< [in][optinal] Pointer to value properties. + const ur_kernel_arg_value_properties_t *pProperties; ///< [in][optional] Pointer to value properties. const void *pNewValueArg; ///< [in][optional] Argument value representing matching kernel arg type to ///< set at argument index. @@ -8258,6 +8279,11 @@ typedef struct ur_exp_command_buffer_update_kernel_launch_desc_t { ur_structure_type_t stype; ///< [in] type of this structure, must be ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure + ur_kernel_handle_t hNewKernel; ///< [in][optional] The new kernel handle. If this parameter is nullptr, + ///< the current kernel handle in `hCommand` + ///< will be used. If a kernel handle is passed, it must be a valid kernel + ///< alternative as defined in + ///< ::urCommandBufferAppendKernelLaunchExp. uint32_t numNewMemObjArgs; ///< [in] Length of pNewMemObjArgList. uint32_t numNewPointerArgs; ///< [in] Length of pNewPointerArgList. uint32_t numNewValueArgs; ///< [in] Length of pNewValueArgList. @@ -8269,15 +8295,25 @@ typedef struct ur_exp_command_buffer_update_kernel_launch_desc_t { const ur_exp_command_buffer_update_value_arg_desc_t *pNewValueArgList; ///< [in][optional][range(0, numNewValueArgs)] An array describing the new ///< kernel value arguments for the command. size_t *pNewGlobalWorkOffset; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the offset used to calculate the global ID. + ///< values that describe the offset used + ///< to calculate the global ID. If this parameter is nullptr, the current + ///< global work offset will be used. This parameter is required if + ///< `newWorkDim` is different from the current work dimensions + ///< in the command. size_t *pNewGlobalWorkSize; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the number of global work-items. + ///< values that describe the number of + ///< global work-items. If this parameter is nullptr, the current global + ///< work size in `hCommand` will be used. + ///< This parameter is required if `newWorkDim` is different from the + ///< current work dimensions in the command. size_t *pNewLocalWorkSize; ///< [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned - ///< values that describe the number of work-items that make up a - ///< work-group. If newWorkDim is non-zero and pNewLocalWorkSize is - ///< nullptr, then runtime implementation will choose the work-group size. - ///< If newWorkDim is zero and pNewLocalWorkSize is nullptr, then the local - ///< work size is unchanged. + ///< values that describe the number of + ///< work-items that make up a work-group. If `pNewGlobalWorkSize` is set + ///< and `pNewLocalWorkSize` is nullptr, + ///< then the runtime implementation will choose the local work size. If + ///< `pNewGlobalWorkSize` is nullptr and + ///< `pNewLocalWorkSize` is nullptr, the current local work size in the + ///< command will be used. } ur_exp_command_buffer_update_kernel_launch_desc_t; @@ -8399,6 +8435,9 @@ urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` @@ -8412,7 +8451,16 @@ urCommandBufferAppendKernelLaunchExp( uint32_t workDim, ///< [in] Dimension of the kernel execution. const size_t *pGlobalWorkOffset, ///< [in] Offset to use when executing kernel. const size_t *pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. - const size_t *pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + const size_t *pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t *phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ///< [in][optional] A list of sync points that this command depends on. May ///< be ignored if command-buffer is in-order. @@ -8927,18 +8975,17 @@ urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created. +/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and, pUpdateKernelLaunch->pNewGlobalWorkSize, or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -11527,6 +11574,8 @@ typedef struct ur_command_buffer_append_kernel_launch_exp_params_t { const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; + uint32_t *pnumKernelAlternatives; + ur_kernel_handle_t **pphKernelAlternatives; uint32_t *pnumSyncPointsInWaitList; const ur_exp_command_buffer_sync_point_t **ppSyncPointWaitList; ur_exp_command_buffer_sync_point_t **ppSyncPoint; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 834c659c13..d2f79f4515 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1932,6 +1932,8 @@ typedef ur_result_t(UR_APICALL *ur_pfnCommandBufferAppendKernelLaunchExp_t)( const size_t *, const size_t *, uint32_t, + ur_kernel_handle_t *, + uint32_t, const ur_exp_command_buffer_sync_point_t *, ur_exp_command_buffer_sync_point_t *, ur_exp_command_buffer_command_handle_t *); diff --git a/include/ur_print.h b/include/ur_print.h index c70e661fb1..54082d5330 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -970,6 +970,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintExpExternalSemaphoreDesc(const struct /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintExpImageCopyRegion(const struct ur_exp_image_copy_region_t params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_device_command_buffer_update_capability_flag_t enum +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintDeviceCommandBufferUpdateCapabilityFlags(enum ur_device_command_buffer_update_capability_flag_t value, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_exp_command_buffer_info_t enum /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 6e84ce97a5..417f33b818 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -197,6 +197,8 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t template <> inline ur_result_t printFlag(std::ostream &os, uint32_t flag); +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag); template <> inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_exp_command_buffer_info_t value, size_t size); @@ -335,6 +337,7 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_external_mem_desc_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_external_semaphore_desc_t params); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_image_copy_region_t params); +inline std::ostream &operator<<(std::ostream &os, enum ur_device_command_buffer_update_capability_flag_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_command_buffer_info_t value); inline std::ostream &operator<<(std::ostream &os, enum ur_exp_command_buffer_command_info_t value); inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_exp_command_buffer_desc_t params); @@ -2541,8 +2544,8 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: os << "UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP"; break; - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: - os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP"; + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: + os << "UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP"; break; case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: os << "UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP"; @@ -4049,15 +4052,16 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - const ur_bool_t *tptr = (const ur_bool_t *)ptr; - if (sizeof(ur_bool_t) > size) { - os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + const ur_device_command_buffer_update_capability_flags_t *tptr = (const ur_device_command_buffer_update_capability_flags_t *)ptr; + if (sizeof(ur_device_command_buffer_update_capability_flags_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_device_command_buffer_update_capability_flags_t) << ")"; return UR_RESULT_ERROR_INVALID_SIZE; } os << (const void *)(tptr) << " ("; - os << *tptr; + ur::details::printFlag(os, + *tptr); os << ")"; } break; @@ -9701,6 +9705,103 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_image_copy return os; } /////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_device_command_buffer_update_capability_flag_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, enum ur_device_command_buffer_update_capability_flag_t value) { + switch (value) { + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET"; + break; + case UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE: + os << "UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE"; + break; + default: + os << "unknown enumerator"; + break; + } + return os; +} + +namespace ur::details { +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_device_command_buffer_update_capability_flag_t flag +template <> +inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { + uint32_t val = flag; + bool first = true; + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + + if ((val & UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE) == (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE) { + val ^= (uint32_t)UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + } + if (val != 0) { + std::bitset<32> bits(val); + if (!first) { + os << " | "; + } + os << "unknown bit flags " << bits; + } else if (first) { + os << "0"; + } + return UR_RESULT_SUCCESS; +} +} // namespace ur::details +/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_exp_command_buffer_info_t type /// @returns /// std::ostream & @@ -9953,6 +10054,12 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_command_bu ur::details::printStruct(os, (params.pNext)); + os << ", "; + os << ".hNewKernel = "; + + ur::details::printPtr(os, + (params.hNewKernel)); + os << ", "; os << ".numNewMemObjArgs = "; @@ -15951,6 +16058,23 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppLocalWorkSize)); + os << ", "; + os << ".numKernelAlternatives = "; + + os << *(params->pnumKernelAlternatives); + + os << ", "; + os << ".phKernelAlternatives = {"; + for (size_t i = 0; *(params->pphKernelAlternatives) != NULL && i < *params->pnumKernelAlternatives; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphKernelAlternatives))[i]); + } + os << "}"; + os << ", "; os << ".numSyncPointsInWaitList = "; diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index c23519cf67..78e7337397 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -144,8 +144,8 @@ were obtained from. // sync-point ${x}CommandBufferAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 1, &syncPoint, - nullptr, nullptr); + pLocalWorkSize, 0, nullptr, 1, + &syncPoint, nullptr, nullptr); Enqueueing Command-Buffers -------------------------------------------------------------------------------- @@ -167,13 +167,21 @@ Updating Command-Buffer Commands An adapter implementing the command-buffer experimental feature can optionally support updating the configuration of kernel commands recorded to a -command-buffer. Support for this is reported by returning true in the -${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP query. +command-buffer. The attributes of kernel commands that can be updated are +device specific and can be queried using the +${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP query. Updating kernel commands is done by passing the new kernel configuration to ${x}CommandBufferUpdateKernelLaunchExp along with the command handle of the kernel command to update. Configurations that can be changed are the -parameters to the kernel and the execution ND-Range. +kernel handle, the parameters to the kernel and the execution ND-Range. + +Kernel handles that might be used to update the kernel of a command, need +to be registered when the command is created. This can be done +using the ``phKernelAlternatives`` parameter of +${x}CommandBufferAppendKernelLaunchExp. The command can then be updated +to use the new kernel handle by passing it to +${x}CommandBufferUpdateKernelLaunchExp. .. parsed-literal:: @@ -187,12 +195,14 @@ parameters to the kernel and the execution ND-Range. ${x}CommandBufferCreateExp(hContext, hDevice, &desc, &hCommandBuffer); // Append a kernel command which has two buffer parameters, an input - // and an output. + // and an output. Register hNewKernel as an alternative kernel handle + // which can later be used to change the kernel handle associated + // with this command. ${x}_exp_command_buffer_command_handle_t hCommand; ${x}CommandBufferAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, 0, nullptr, - nullptr, &hCommand); + pLocalWorkSize, 1, &hNewKernel, + 0, nullptr, nullptr, &hCommand); // Close the command-buffer before updating ${x}CommandBufferFinalizeExp(hCommandBuffer); @@ -220,6 +230,7 @@ parameters to the kernel and the execution ND-Range. ${x}_exp_command_buffer_update_kernel_launch_desc_t update { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + hNewKernel // hNewKernel 2, // numNewMemobjArgs 0, // numNewPointerArgs 0, // numNewValueArgs @@ -249,7 +260,13 @@ Enums ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * ${x}_device_info_t * ${X}_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP - * ${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP + * ${X}_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP +* ${x}_device_command_buffer_update_capability_flags_t + * UPDATE_KERNEL_ARGUMENTS + * LOCAL_WORK_SIZE + * GLOBAL_WORK_SIZE + * GLOBAL_WORK_OFFSET + * KERNEL_HANDLE * ${x}_result_t * ${X}_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP * ${X}_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP @@ -340,6 +357,8 @@ Changelog +-----------+-------------------------------------------------------+ | 1.4 | Add function definitions for kernel command update | +-----------+-------------------------------------------------------+ +| 1.5 | Add support for updating kernel handles. | ++-----------+-------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- @@ -348,3 +367,4 @@ Contributors * Ewan Crawford `ewan@codeplay.com `_ * Maxime France-Pillois `maxime.francepillois@codeplay.com `_ * Aaron Greig `aaron.greig@codeplay.com `_ +* Fábio Mestre `fabio.mestre@codeplay.com `_ diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 72b4e63f74..09be3e2492 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -21,9 +21,31 @@ etors: - name: COMMAND_BUFFER_SUPPORT_EXP value: "0x1000" desc: "[$x_bool_t] Returns true if the device supports the use of command-buffers." - - name: COMMAND_BUFFER_UPDATE_SUPPORT_EXP + - name: COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP + desc: "[$x_device_command_buffer_update_capability_flags_t] Command-buffer update capabilities of the device" value: "0x1001" - desc: "[$x_bool_t] Returns true if the device supports updating the kernel commands in a command-buffer." +--- #-------------------------------------------------------------------------- +type: enum +desc: "Device kernel execution capability" +class: $xDevice +name: $x_device_command_buffer_update_capability_flags_t +etors: + - name: KERNEL_ARGUMENTS + value: "$X_BIT(0)" + desc: "Device supports updating the kernel arguments in command-buffer commands." + - name: LOCAL_WORK_SIZE + value: "$X_BIT(1)" + desc: "Device supports updating the local work-group size in command-buffer commands." + - name: GLOBAL_WORK_SIZE + value: "$X_BIT(2)" + desc: "Device supports updating the global work-group size in command-buffer commands." + - name: GLOBAL_WORK_OFFSET + value: "$X_BIT(3)" + desc: "Device supports updating the global work offset in command-buffer commands." + - name: KERNEL_HANDLE + value: "$X_BIT(4)" + desc: "Device supports updating the kernel handle in command-buffer commands." + --- #-------------------------------------------------------------------------- type: enum extend: true @@ -127,7 +149,7 @@ members: desc: "[in] Argument index." - type: "const ur_kernel_arg_mem_obj_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to memory object properties." + desc: "[in][optional] Pointer to memory object properties." - type: $x_mem_handle_t name: hNewMemObjArg desc: "[in][optional] Handle of memory object to set at argument index." @@ -142,7 +164,7 @@ members: desc: "[in] Argument index." - type: "const ur_kernel_arg_pointer_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to USM pointer properties." + desc: "[in][optional] Pointer to USM pointer properties." - type: "const void *" name: pNewPointerArg desc: "[in][optional] USM pointer to memory location holding the argument value to set at argument index." @@ -160,7 +182,7 @@ members: desc: "[in] Argument size." - type: "const ur_kernel_arg_value_properties_t *" name: pProperties - desc: "[in][optinal] Pointer to value properties." + desc: "[in][optional] Pointer to value properties." - type: "const void *" name: pNewValueArg desc: "[in][optional] Argument value representing matching kernel arg type to set at argument index." @@ -170,6 +192,12 @@ desc: "Descriptor type for updating a kernel launch command." base: $x_base_desc_t name: $x_exp_command_buffer_update_kernel_launch_desc_t members: + - type: $x_kernel_handle_t + name: hNewKernel + desc: | + [in][optional] The new kernel handle. If this parameter is nullptr, the current kernel handle in `hCommand` + will be used. If a kernel handle is passed, it must be a valid kernel alternative as defined in + $xCommandBufferAppendKernelLaunchExp. - type: uint32_t name: numNewMemObjArgs desc: "[in] Length of pNewMemObjArgList." @@ -193,13 +221,23 @@ members: desc: "[in][optional][range(0, numNewValueArgs)] An array describing the new kernel value arguments for the command." - type: "size_t*" name: pNewGlobalWorkOffset - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the offset used to calculate the global ID." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the offset used + to calculate the global ID. If this parameter is nullptr, the current global work offset will be used. This parameter is required if `newWorkDim` is different from the current work dimensions + in the command. - type: "size_t*" name: pNewGlobalWorkSize - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of global work-items." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of + global work-items. If this parameter is nullptr, the current global work size in `hCommand` will be used. + This parameter is required if `newWorkDim` is different from the current work dimensions in the command. - type: "size_t*" name: pNewLocalWorkSize - desc: "[in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of work-items that make up a work-group. If newWorkDim is non-zero and pNewLocalWorkSize is nullptr, then runtime implementation will choose the work-group size. If newWorkDim is zero and pNewLocalWorkSize is nullptr, then the local work size is unchanged." + desc: | + [in][optional][range(0, newWorkDim)] Array of newWorkDim unsigned values that describe the number of + work-items that make up a work-group. If `pNewGlobalWorkSize` is set and `pNewLocalWorkSize` is nullptr, + then the runtime implementation will choose the local work size. If `pNewGlobalWorkSize` is nullptr and + `pNewLocalWorkSize` is nullptr, the current local work size in the command will be used. --- #-------------------------------------------------------------------------- type: typedef desc: "A value that identifies a command inside of a command-buffer, used for defining dependencies between commands in the same command-buffer." @@ -306,7 +344,16 @@ params: desc: "[in] Global work size to use when executing kernel." - type: "const size_t*" name: pLocalWorkSize - desc: "[in][optional] Local work size to use when executing kernel." + desc: "[in][optional] Local work size to use when executing kernel. If this parameter is nullptr, then a local work size will be generated by the implementation." + - type: uint32_t + name: "numKernelAlternatives" + desc: "[in] The number of kernel alternatives provided in phKernelAlternatives." + - type: $x_kernel_handle_t* + name: "phKernelAlternatives" + desc: | + [in][optional][range(0, numKernelAlternatives)] List of kernel handles that might be used to update the kernel in this + command after the command-buffer is finalized. The default kernel `hKernel` is implicitly marked as an alternative. It's + invalid to specify it as part of this list. - type: uint32_t name: numSyncPointsInWaitList desc: "[in] The number of sync points in the provided dependency list." @@ -325,7 +372,10 @@ returns: - $X_RESULT_ERROR_INVALID_KERNEL - $X_RESULT_ERROR_INVALID_WORK_DIMENSION - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_VALUE: + - "`phKernelAlternatives == NULL && numKernelAlternatives > 0`" + - "`phKernelAlternatives != NULL && numKernelAlternatives == 0`" + - "If `phKernelAlternatives` contains `hKernel`" - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP: - "`pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0`" @@ -909,25 +959,23 @@ params: - type: "const $x_exp_command_buffer_update_kernel_launch_desc_t*" name: pUpdateKernelLaunch desc: "[in] Struct defining how the kernel command is to be updated." - returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If update functionality is not supported by the device." - $X_RESULT_ERROR_INVALID_OPERATION: - "If $x_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to." - "If the command-buffer `hCommand` belongs to has not been finalized." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size." - - "If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size." - $X_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP - $X_RESULT_ERROR_INVALID_MEM_OBJECT - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX - $X_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE - $X_RESULT_ERROR_INVALID_ENUMERATION - - $X_RESULT_ERROR_INVALID_WORK_DIMENSION + - $X_RESULT_ERROR_INVALID_WORK_DIMENSION: + - "`pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3`" - $X_RESULT_ERROR_INVALID_WORK_GROUP_SIZE - - $X_RESULT_ERROR_INVALID_VALUE + - $X_RESULT_ERROR_INVALID_VALUE: + - "If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of $xCommandBufferAppendKernelLaunchExp when this command was created." + - "If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and, pUpdateKernelLaunch->pNewGlobalWorkSize, or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr." - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 2fdb6b08a3..7c06aa91a0 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -76,9 +76,11 @@ ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, - const size_t *LocalWorkSizePtr) - : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), - WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives) + : CommandBuffer(CommandBuffer), Kernel(Kernel), ValidKernelHandles(), + Node(Node), Params(Params), WorkDim(WorkDim), RefCountInternal(1), + RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); const size_t CopySize = sizeof(size_t) * WorkDim; @@ -96,6 +98,13 @@ ur_exp_command_buffer_command_handle_t_:: std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize); std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); } + + /* Add the default Kernel as a valid kernel handle for this command */ + ValidKernelHandles.insert(Kernel); + if (KernelAlternatives) { + ValidKernelHandles.insert(KernelAlternatives, + KernelAlternatives + NumKernelAlternatives); + } } /// Helper function for finding the Cuda Nodes associated with the @@ -344,6 +353,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, @@ -354,6 +364,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + for (uint32_t i = 0; i < numKernelAlternatives; ++i) { + UR_ASSERT(phKernelAlternatives[i] != hKernel, + UR_RESULT_ERROR_INVALID_VALUE); + } + CUgraphNode GraphNode; std::vector DepsList; @@ -418,8 +433,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, GraphNode, NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, + NodeParams, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, numKernelAlternatives, + phKernelAlternatives}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -847,51 +864,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return commandHandleReleaseInternal(hCommand); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( - ur_exp_command_buffer_command_handle_t hCommand, - const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { - // Update requires command-buffer to be finalized - ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; - if (!CommandBuffer->CudaGraphExec) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } +/** + * Validates contents of the update command description. + * @param[in] Command The command which is being updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +validateCommandDesc(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + auto CommandBuffer = Command->CommandBuffer; - // Update requires command-buffer to be created with update enabled - if (!CommandBuffer->IsUpdatable) { + // Update requires the command-buffer to be finalized and updatable. + if (!CommandBuffer->CudaGraphExec || !CommandBuffer->IsUpdatable) { return UR_RESULT_ERROR_INVALID_OPERATION; } - if (auto NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = hCommand->isNullLocalSize(); + if (UpdateCommandDesc->newWorkDim != Command->WorkDim && + (!UpdateCommandDesc->pNewGlobalWorkOffset || + !UpdateCommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->hNewKernel && + !Command->ValidKernelHandles.count(UpdateCommandDesc->hNewKernel)) { + return UR_RESULT_ERROR_INVALID_VALUE; } + return UR_RESULT_SUCCESS; +} + +/** + * Updates the arguments of CommandDesc->hNewKernel + * @param[in] Device The device associated with the kernel being updated. + * @param[in] UpdateCommandDesc The update command description that contains the + * new kernel and its arguments. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateKernelArguments(ur_device_handle_t Device, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { - // Kernel corresponding to the command to update - ur_kernel_handle_t Kernel = hCommand->Kernel; + ur_kernel_handle_t NewKernel = UpdateCommandDesc->hNewKernel; // Update pointer arguments to the kernel - uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs; + uint32_t NumPointerArgs = UpdateCommandDesc->numNewPointerArgs; const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList = - pUpdateKernelLaunch->pNewPointerArgList; + UpdateCommandDesc->pNewPointerArgList; for (uint32_t i = 0; i < NumPointerArgs; i++) { const auto &PointerArgDesc = ArgPointerList[i]; uint32_t ArgIndex = PointerArgDesc.argIndex; @@ -899,7 +920,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_result_t Result = UR_RESULT_SUCCESS; try { - Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); + NewKernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { Result = Err; return Result; @@ -907,9 +928,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } // Update memobj arguments to the kernel - uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs; + uint32_t NumMemobjArgs = UpdateCommandDesc->numNewMemObjArgs; const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList = - pUpdateKernelLaunch->pNewMemObjArgList; + UpdateCommandDesc->pNewMemObjArgList; for (uint32_t i = 0; i < NumMemobjArgs; i++) { const auto &MemobjArgDesc = ArgMemobjList[i]; uint32_t ArgIndex = MemobjArgDesc.argIndex; @@ -918,11 +939,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ur_result_t Result = UR_RESULT_SUCCESS; try { if (ArgValue == nullptr) { - Kernel->setKernelArg(ArgIndex, 0, nullptr); + NewKernel->setKernelArg(ArgIndex, 0, nullptr); } else { - CUdeviceptr CuPtr = - std::get(ArgValue->Mem).getPtr(CommandBuffer->Device); - Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); + CUdeviceptr CuPtr = std::get(ArgValue->Mem).getPtr(Device); + NewKernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { Result = Err; @@ -931,9 +951,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } // Update value arguments to the kernel - uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs; + uint32_t NumValueArgs = UpdateCommandDesc->numNewValueArgs; const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList = - pUpdateKernelLaunch->pNewValueArgList; + UpdateCommandDesc->pNewValueArgList; for (uint32_t i = 0; i < NumValueArgs; i++) { const auto &ValueArgDesc = ArgValueList[i]; uint32_t ArgIndex = ValueArgDesc.argIndex; @@ -941,54 +961,81 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( const void *ArgValue = ValueArgDesc.pNewValueArg; ur_result_t Result = UR_RESULT_SUCCESS; - try { - Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); + NewKernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { Result = Err; return Result; } } - // Set the updated ND range - const uint32_t NewWorkDim = pUpdateKernelLaunch->newWorkDim; - if (NewWorkDim != 0) { - UR_ASSERT(NewWorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(NewWorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - hCommand->WorkDim = NewWorkDim; + return UR_RESULT_SUCCESS; +} + +/** + * Updates the command buffer command with new values from the update + * description. + * @param[in] Command The command to be updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateCommand(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + if (UpdateCommandDesc->hNewKernel) { + Command->Kernel = UpdateCommandDesc->hNewKernel; } - if (pUpdateKernelLaunch->pNewGlobalWorkOffset) { - hCommand->setGlobalOffset(pUpdateKernelLaunch->pNewGlobalWorkOffset); + if (UpdateCommandDesc->newWorkDim) { + Command->WorkDim = UpdateCommandDesc->newWorkDim; } - if (pUpdateKernelLaunch->pNewGlobalWorkSize) { - hCommand->setGlobalSize(pUpdateKernelLaunch->pNewGlobalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkOffset) { + Command->setGlobalOffset(UpdateCommandDesc->pNewGlobalWorkOffset); } - if (pUpdateKernelLaunch->pNewLocalWorkSize) { - hCommand->setLocalSize(pUpdateKernelLaunch->pNewLocalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkSize) { + Command->setGlobalSize(UpdateCommandDesc->pNewGlobalWorkSize); + if (!UpdateCommandDesc->pNewLocalWorkSize) { + Command->setNullLocalSize(); + } } - size_t *GlobalWorkOffset = hCommand->GlobalWorkOffset; - size_t *GlobalWorkSize = hCommand->GlobalWorkSize; + if (UpdateCommandDesc->pNewLocalWorkSize) { + Command->setLocalSize(UpdateCommandDesc->pNewLocalWorkSize); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch) { - // If no worksize is provided make sure we pass nullptr to setKernelParams so + ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + UR_CHECK_ERROR(validateCommandDesc(hCommand, pUpdateKernelLaunch)); + UR_CHECK_ERROR( + updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch)); + UR_CHECK_ERROR(updateCommand(hCommand, pUpdateKernelLaunch)); + + // If no work-size is provided make sure we pass nullptr to setKernelParams so // it can guess the local work size. const bool ProvidedLocalSize = !hCommand->isNullLocalSize(); size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr; - uint32_t WorkDim = hCommand->WorkDim; // Set the number of threads per block to the number of threads per warp - // by default unless user has provided a better number + // by default unless user has provided a better number. size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - CUfunction CuFunc = Kernel->get(); - ur_context_handle_t Context = CommandBuffer->Context; - ur_device_handle_t Device = CommandBuffer->Device; - auto Result = setKernelParams(Context, Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, CuFunc, - ThreadsPerBlock, BlocksPerGrid); + CUfunction CuFunc = hCommand->Kernel->get(); + auto Result = setKernelParams( + CommandBuffer->Context, CommandBuffer->Device, hCommand->WorkDim, + hCommand->GlobalWorkOffset, hCommand->GlobalWorkSize, LocalWorkSize, + hCommand->Kernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); if (Result != UR_RESULT_SUCCESS) { return Result; } @@ -1002,8 +1049,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDimX = ThreadsPerBlock[0]; Params.blockDimY = ThreadsPerBlock[1]; Params.blockDimZ = ThreadsPerBlock[2]; - Params.sharedMemBytes = Kernel->getLocalSize(); - Params.kernelParams = const_cast(Kernel->getArgIndices().data()); + Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); + Params.kernelParams = + const_cast(hCommand->Kernel->getArgIndices().data()); CUgraphNode Node = hCommand->Node; CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec; diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 504095612b..a973389426 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -16,6 +16,7 @@ #include "logger/ur_logger.hpp" #include #include +#include // Trace an internal UR call #define UR_TRACE(Call) \ @@ -44,7 +45,8 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, CUgraphNode Node, CUDA_KERNEL_NODE_PARAMS Params, uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, - const size_t *LocalWorkSizePtr); + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -73,6 +75,10 @@ struct ur_exp_command_buffer_command_handle_t_ { } } + void setNullLocalSize() noexcept { + std::memset(LocalWorkSize, 0, sizeof(size_t) * 3); + } + bool isNullLocalSize() const noexcept { const size_t Zeros[3] = {0, 0, 0}; return 0 == std::memcmp(LocalWorkSize, Zeros, sizeof(LocalWorkSize)); @@ -96,7 +102,13 @@ struct ur_exp_command_buffer_command_handle_t_ { } ur_exp_command_buffer_handle_t CommandBuffer; + + // The currently active kernel handle for this command. ur_kernel_handle_t Kernel; + + // Set of all the kernel handles that can be used when updating this command. + std::unordered_set ValidKernelHandles; + CUgraphNode Node; CUDA_KERNEL_NODE_PARAMS Params; diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index 6ced1e8e87..cf3082e7cf 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1088,8 +1088,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: - return ReturnValue(static_cast(true)); + return ReturnValue(true); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + return ReturnValue(UpdateCapabilities); + } case UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP: { int Value = getAttribute(hDevice, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >= 9; diff --git a/source/adapters/hip/command_buffer.cpp b/source/adapters/hip/command_buffer.cpp index a76f3e12be..2b58c3df9c 100644 --- a/source/adapters/hip/command_buffer.cpp +++ b/source/adapters/hip/command_buffer.cpp @@ -78,7 +78,8 @@ ur_exp_command_buffer_command_handle_t_:: ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, - const size_t *LocalWorkSizePtr) + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives) : CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params), WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) { CommandBuffer->incrementInternalReferenceCount(); @@ -98,6 +99,13 @@ ur_exp_command_buffer_command_handle_t_:: std::memset(GlobalWorkOffset + WorkDim, 0, ZeroSize); std::memset(GlobalWorkSize + WorkDim, 0, ZeroSize); } + + /* Add the default Kernel as a valid kernel handle for this command */ + ValidKernelHandles.insert(Kernel); + if (KernelAlternatives) { + ValidKernelHandles.insert(KernelAlternatives, + KernelAlternatives + NumKernelAlternatives); + } } /// Helper function for finding the HIP Nodes associated with the commands in a @@ -312,6 +320,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, @@ -321,9 +330,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + for (uint32_t i = 0; i < numKernelAlternatives; ++i) { + UR_ASSERT(phKernelAlternatives[i] != hKernel, + UR_RESULT_ERROR_INVALID_VALUE); + } + hipGraphNode_t GraphNode; std::vector DepsList; @@ -388,8 +403,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( } auto NewCommand = new ur_exp_command_buffer_command_handle_t_{ - hCommandBuffer, hKernel, GraphNode, NodeParams, - workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize}; + hCommandBuffer, hKernel, GraphNode, + NodeParams, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, numKernelAlternatives, + phKernelAlternatives}; NewCommand->incrementInternalReferenceCount(); hCommandBuffer->CommandHandles.push_back(NewCommand); @@ -832,68 +849,72 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( return commandHandleReleaseInternal(hCommand); } -UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( - ur_exp_command_buffer_command_handle_t hCommand, - const ur_exp_command_buffer_update_kernel_launch_desc_t - *pUpdateKernelLaunch) { - // Update requires command-buffer to be finalized - ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; - if (!CommandBuffer->HIPGraphExec) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } +/** + * Validates contents of the update command description. + * @param[in] Command The command which is being updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +validateCommandDesc(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + auto CommandBuffer = Command->CommandBuffer; - // Update requires command-buffer to be created with update enabled - if (!CommandBuffer->IsUpdatable) { + // Update requires the command-buffer to be finalized and updatable. + if (!CommandBuffer->HIPGraphExec || !CommandBuffer->IsUpdatable) { return UR_RESULT_ERROR_INVALID_OPERATION; } - if (auto NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->newWorkDim != Command->WorkDim && + (!UpdateCommandDesc->pNewGlobalWorkOffset || + !UpdateCommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (UpdateCommandDesc->hNewKernel && + !Command->ValidKernelHandles.count(UpdateCommandDesc->hNewKernel)) { + return UR_RESULT_ERROR_INVALID_VALUE; + } - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = hCommand->isNullLocalSize(); + return UR_RESULT_SUCCESS; +} - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - } +/** + * Updates the arguments of CommandDesc->hNewKernel + * @param[in] Device The device associated with the kernel being updated. + * @param[in] UpdateCommandDesc The update command description that contains + * the new kernel and its arguments. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateKernelArguments(ur_device_handle_t Device, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { - // Kernel corresponding to the command to update - ur_kernel_handle_t Kernel = hCommand->Kernel; - ur_device_handle_t Device = CommandBuffer->Device; + ur_kernel_handle_t NewKernel = UpdateCommandDesc->hNewKernel; // Update pointer arguments to the kernel - uint32_t NumPointerArgs = pUpdateKernelLaunch->numNewPointerArgs; + uint32_t NumPointerArgs = UpdateCommandDesc->numNewPointerArgs; const ur_exp_command_buffer_update_pointer_arg_desc_t *ArgPointerList = - pUpdateKernelLaunch->pNewPointerArgList; + UpdateCommandDesc->pNewPointerArgList; for (uint32_t i = 0; i < NumPointerArgs; i++) { const auto &PointerArgDesc = ArgPointerList[i]; uint32_t ArgIndex = PointerArgDesc.argIndex; const void *ArgValue = PointerArgDesc.pNewPointerArg; try { - Kernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); + NewKernel->setKernelArg(ArgIndex, sizeof(ArgValue), ArgValue); } catch (ur_result_t Err) { return Err; } } // Update memobj arguments to the kernel - uint32_t NumMemobjArgs = pUpdateKernelLaunch->numNewMemObjArgs; + uint32_t NumMemobjArgs = UpdateCommandDesc->numNewMemObjArgs; const ur_exp_command_buffer_update_memobj_arg_desc_t *ArgMemobjList = - pUpdateKernelLaunch->pNewMemObjArgList; + UpdateCommandDesc->pNewMemObjArgList; for (uint32_t i = 0; i < NumMemobjArgs; i++) { const auto &MemobjArgDesc = ArgMemobjList[i]; uint32_t ArgIndex = MemobjArgDesc.argIndex; @@ -901,10 +922,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( try { if (ArgValue == nullptr) { - Kernel->setKernelArg(ArgIndex, 0, nullptr); + NewKernel->setKernelArg(ArgIndex, 0, nullptr); } else { void *HIPPtr = std::get(ArgValue->Mem).getVoid(Device); - Kernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr); + NewKernel->setKernelArg(ArgIndex, sizeof(void *), (void *)&HIPPtr); } } catch (ur_result_t Err) { return Err; @@ -912,9 +933,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( } // Update value arguments to the kernel - uint32_t NumValueArgs = pUpdateKernelLaunch->numNewValueArgs; + uint32_t NumValueArgs = UpdateCommandDesc->numNewValueArgs; const ur_exp_command_buffer_update_value_arg_desc_t *ArgValueList = - pUpdateKernelLaunch->pNewValueArgList; + UpdateCommandDesc->pNewValueArgList; for (uint32_t i = 0; i < NumValueArgs; i++) { const auto &ValueArgDesc = ArgValueList[i]; uint32_t ArgIndex = ValueArgDesc.argIndex; @@ -922,49 +943,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( const void *ArgValue = ValueArgDesc.pNewValueArg; try { - Kernel->setKernelArg(ArgIndex, ArgSize, ArgValue); + NewKernel->setKernelArg(ArgIndex, ArgSize, ArgValue); } catch (ur_result_t Err) { return Err; } } - // Set the updated ND range - const uint32_t NewWorkDim = pUpdateKernelLaunch->newWorkDim; - if (NewWorkDim != 0) { - UR_ASSERT(NewWorkDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(NewWorkDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - hCommand->WorkDim = NewWorkDim; + return UR_RESULT_SUCCESS; +} + +/** + * Updates the command buffer command with new values from the update + * description. + * @param[in] Command The command to be updated. + * @param[in] UpdateCommandDesc The update command description. + * @return UR_RESULT_SUCCESS or an error code on failure + */ +ur_result_t +updateCommand(ur_exp_command_buffer_command_handle_t Command, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *UpdateCommandDesc) { + + if (UpdateCommandDesc->hNewKernel) { + Command->Kernel = UpdateCommandDesc->hNewKernel; } - if (pUpdateKernelLaunch->pNewGlobalWorkOffset) { - hCommand->setGlobalOffset(pUpdateKernelLaunch->pNewGlobalWorkOffset); + if (UpdateCommandDesc->hNewKernel) { + Command->WorkDim = UpdateCommandDesc->newWorkDim; } - if (pUpdateKernelLaunch->pNewGlobalWorkSize) { - hCommand->setGlobalSize(pUpdateKernelLaunch->pNewGlobalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkOffset) { + Command->setGlobalOffset(UpdateCommandDesc->pNewGlobalWorkOffset); } - if (pUpdateKernelLaunch->pNewLocalWorkSize) { - hCommand->setLocalSize(pUpdateKernelLaunch->pNewLocalWorkSize); + if (UpdateCommandDesc->pNewGlobalWorkSize) { + Command->setGlobalSize(UpdateCommandDesc->pNewGlobalWorkSize); + if (!UpdateCommandDesc->pNewLocalWorkSize) { + Command->setNullLocalSize(); + } } - size_t *GlobalWorkOffset = hCommand->GlobalWorkOffset; - size_t *GlobalWorkSize = hCommand->GlobalWorkSize; + if (UpdateCommandDesc->pNewLocalWorkSize) { + Command->setLocalSize(UpdateCommandDesc->pNewLocalWorkSize); + } - // If no worksize is provided make sure we pass nullptr to setKernelParams so - // it can guess the local work size. + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( + ur_exp_command_buffer_command_handle_t hCommand, + const ur_exp_command_buffer_update_kernel_launch_desc_t + *pUpdateKernelLaunch) { + + ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer; + + UR_CHECK_ERROR(validateCommandDesc(hCommand, pUpdateKernelLaunch)); + UR_CHECK_ERROR( + updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch)); + UR_CHECK_ERROR(updateCommand(hCommand, pUpdateKernelLaunch)); + + // If no worksize is provided make sure we pass nullptr to setKernelParams + // so it can guess the local work size. const bool ProvidedLocalSize = !hCommand->isNullLocalSize(); size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr; - uint32_t WorkDim = hCommand->WorkDim; // Set the number of threads per block to the number of threads per warp // by default unless user has provided a better number size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - hipFunction_t HIPFunc = Kernel->get(); - UR_CHECK_ERROR(setKernelParams(Device, WorkDim, GlobalWorkOffset, - GlobalWorkSize, LocalWorkSize, Kernel, HIPFunc, - ThreadsPerBlock, BlocksPerGrid)); + hipFunction_t HIPFunc = hCommand->Kernel->get(); + UR_CHECK_ERROR(setKernelParams( + CommandBuffer->Device, hCommand->WorkDim, hCommand->GlobalWorkOffset, + hCommand->GlobalWorkSize, LocalWorkSize, hCommand->Kernel, HIPFunc, + ThreadsPerBlock, BlocksPerGrid)); hipKernelNodeParams &Params = hCommand->Params; @@ -975,8 +1026,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Params.blockDim.x = ThreadsPerBlock[0]; Params.blockDim.y = ThreadsPerBlock[1]; Params.blockDim.z = ThreadsPerBlock[2]; - Params.sharedMemBytes = Kernel->getLocalSize(); - Params.kernelParams = const_cast(Kernel->getArgIndices().data()); + Params.sharedMemBytes = hCommand->Kernel->getLocalSize(); + Params.kernelParams = + const_cast(hCommand->Kernel->getArgIndices().data()); hipGraphNode_t Node = hCommand->Node; hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec; diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index d744a3544d..e162b8e640 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -15,6 +15,7 @@ #include "context.hpp" #include #include +#include // Trace an internal UR call #define UR_TRACE(Call) \ @@ -43,7 +44,8 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, hipGraphNode_t Node, hipKernelNodeParams Params, uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr, const size_t *GlobalWorkSizePtr, - const size_t *LocalWorkSizePtr); + const size_t *LocalWorkSizePtr, uint32_t NumKernelAlternatives, + ur_kernel_handle_t *KernelAlternatives); void setGlobalOffset(const size_t *GlobalWorkOffsetPtr) { const size_t CopySize = sizeof(size_t) * WorkDim; @@ -72,6 +74,10 @@ struct ur_exp_command_buffer_command_handle_t_ { } } + void setNullLocalSize() noexcept { + std::memset(LocalWorkSize, 0, sizeof(size_t) * 3); + } + bool isNullLocalSize() const noexcept { const size_t Zeros[3] = {0, 0, 0}; return 0 == std::memcmp(LocalWorkSize, Zeros, sizeof(LocalWorkSize)); @@ -95,7 +101,13 @@ struct ur_exp_command_buffer_command_handle_t_ { } ur_exp_command_buffer_handle_t CommandBuffer; + + // The currently active kernel handle for this command. ur_kernel_handle_t Kernel; + + // Set of all the kernel handles that can be used when updating this command. + std::unordered_set ValidKernelHandles; + hipGraphNode_t Node; hipKernelNodeParams Params; diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index 2bf6763046..be3188b50d 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -903,9 +903,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_IL_VERSION: case UR_DEVICE_INFO_ASYNC_BARRIER: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; - - case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { + case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: { int DriverVersion = 0; UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion)); @@ -917,6 +915,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, const int CmdBufDriverMinVersion = 50530202; // ROCM 5.5.1 return ReturnValue(DriverVersion >= CmdBufDriverMinVersion); } + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + int DriverVersion = 0; + UR_CHECK_ERROR(hipDriverGetVersion(&DriverVersion)); + const int CmdBufDriverMinVersion = 50530202; // ROCM 5.5.1 + if (DriverVersion < CmdBufDriverMinVersion) { + return ReturnValue( + static_cast(0)); + } + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE; + return ReturnValue(UpdateCapabilities); + } default: break; } diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 1bf4f26716..c084303d52 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -733,6 +733,8 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel, uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + uint32_t /*numKernelAlternatives*/, + ur_kernel_handle_t * /*phKernelAlternatives*/, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *RetSyncPoint, @@ -1318,27 +1320,15 @@ ur_result_t validateCommandDesc( ->mutableCommandFlags; logger::debug("Mutable features supported by device {}", SupportedFeatures); - uint32_t Dim = CommandDesc->newWorkDim; - if (Dim != 0) { - // Error if work dim changes - if (Dim != Command->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((CommandDesc->pNewLocalWorkSize != nullptr) && - (CommandDesc->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = CommandDesc->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = !Command->UserDefinedLocalSize; + // Kernel handle updates are not yet supported. + if (CommandDesc->hNewKernel != Command->Kernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (CommandDesc->newWorkDim != Command->WorkDim && + (!CommandDesc->pNewGlobalWorkOffset || + !CommandDesc->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_VALUE; } // Check if new global offset is provided. @@ -1346,7 +1336,7 @@ ur_result_t validateCommandDesc( UR_ASSERT(!NewGlobalWorkOffset || (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - if (NewGlobalWorkOffset && Dim > 0) { + if (NewGlobalWorkOffset) { if (!CommandBuffer->Context->getPlatform() ->ZeDriverGlobalOffsetExtensionFound) { logger::error("No global offset extension found on this driver"); @@ -1616,8 +1606,6 @@ ur_result_t urCommandBufferUpdateKernelLaunchExp( ur_exp_command_buffer_command_handle_t Command, const ur_exp_command_buffer_update_kernel_launch_desc_t *CommandDesc) { UR_ASSERT(Command->Kernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(CommandDesc->newWorkDim <= 3, - UR_RESULT_ERROR_INVALID_WORK_DIMENSION); // Lock command, kernel and command buffer for update. std::scoped_lock Guard( diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index e6cb650420..6ec537f4f5 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -994,20 +994,34 @@ ur_result_t urDeviceGetInfo( } case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - // Update support requires being able to update kernel arguments and all - // aspects of the kernel NDRange. - const ze_mutable_command_exp_flags_t UpdateMask = - ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | - ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | - ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; - - const bool KernelArgUpdateSupport = - (Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags & - UpdateMask) == UpdateMask; - return ReturnValue(KernelArgUpdateSupport && - Device->Platform->ZeMutableCmdListExt.Supported); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { + const auto ZeMutableCommandFlags = + Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags; + + auto supportsFlags = [&](ze_mutable_command_exp_flags_t RequiredFlags) { + return (ZeMutableCommandFlags & RequiredFlags) == RequiredFlags; + }; + + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = 0; + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + /* These capabilities are bundled together because, when the user updates + * the global work-size, the implementation might have to generate a new + * local work-size. This would require both mutable command flags to be set + * even though only the global work-size was explicitly updated. */ + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + if (supportsFlags(ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET)) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + return ReturnValue(UpdateCapabilities); } case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: { // On L0 bindless images are supported. diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index f95625dd5b..2b163f6749 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -557,6 +557,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, diff --git a/source/adapters/level_zero/v2/api.cpp b/source/adapters/level_zero/v2/api.cpp index cd25f838fe..d8c7f92638 100644 --- a/source/adapters/level_zero/v2/api.cpp +++ b/source/adapters/level_zero/v2/api.cpp @@ -422,6 +422,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numKernelAlternatives, ur_kernel_handle_t *phKernelAlternatives, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index 20d9cc5bed..8185858486 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -8349,7 +8349,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -8369,6 +8380,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numKernelAlternatives, + &phKernelAlternatives, &numSyncPointsInWaitList, &pSyncPointWaitList, &pSyncPoint, diff --git a/source/adapters/native_cpu/command_buffer.cpp b/source/adapters/native_cpu/command_buffer.cpp index fde6c03b86..2c5e350860 100644 --- a/source/adapters/native_cpu/command_buffer.cpp +++ b/source/adapters/native_cpu/command_buffer.cpp @@ -49,7 +49,7 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t, ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, const size_t *, uint32_t, - const ur_exp_command_buffer_sync_point_t *, + ur_kernel_handle_t *, uint32_t, const ur_exp_command_buffer_sync_point_t *, ur_exp_command_buffer_sync_point_t *, ur_exp_command_buffer_command_handle_t *) { detail::ur::die("Experimental Command-buffer feature is not " diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 0061fae907..15dbed75c9 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -400,8 +400,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(false); case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: + return ReturnValue( + static_cast(0)); case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: return ReturnValue(false); diff --git a/source/adapters/opencl/command_buffer.cpp b/source/adapters/opencl/command_buffer.cpp index 79454506ad..414442bb71 100644 --- a/source/adapters/opencl/command_buffer.cpp +++ b/source/adapters/opencl/command_buffer.cpp @@ -71,10 +71,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp( const bool IsUpdatable = pCommandBufferDesc ? pCommandBufferDesc->isUpdatable : false; - bool DeviceSupportsUpdate = false; + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities; cl_device_id CLDevice = cl_adapter::cast(hDevice); - CL_RETURN_ON_FAILURE(deviceSupportsURCommandBufferKernelUpdate( - CLDevice, DeviceSupportsUpdate)); + CL_RETURN_ON_FAILURE( + getDeviceCommandBufferUpdateCapabilities(CLDevice, UpdateCapabilities)); + bool DeviceSupportsUpdate = UpdateCapabilities > 0; if (IsUpdatable && !DeviceSupportsUpdate) { return UR_RESULT_ERROR_INVALID_OPERATION; @@ -140,6 +141,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t /*numKernelAlternatives*/, + ur_kernel_handle_t * /*phKernelAlternatives*/, uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint, @@ -175,7 +178,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( try { auto URCommandHandle = std::make_unique( - hCommandBuffer, CommandHandle, workDim, pLocalWorkSize != nullptr); + hCommandBuffer, CommandHandle, hKernel, workDim, + pLocalWorkSize != nullptr); ur_exp_command_buffer_command_handle_t Handle = URCommandHandle.release(); hCommandBuffer->CommandHandles.push_back(Handle); if (phCommandHandle) { @@ -485,6 +489,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( const ur_exp_command_buffer_update_kernel_launch_desc_t *pUpdateKernelLaunch) { + // Kernel handle updates are not yet supported. + if (pUpdateKernelLaunch->hNewKernel != hCommand->Kernel) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + ur_exp_command_buffer_handle_t hCommandBuffer = hCommand->hCommandBuffer; cl_context CLContext = cl_adapter::cast(hCommandBuffer->hContext); @@ -497,27 +506,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (!hCommandBuffer->IsFinalized || !hCommandBuffer->IsUpdatable) return UR_RESULT_ERROR_INVALID_OPERATION; - if (cl_uint NewWorkDim = pUpdateKernelLaunch->newWorkDim) { - // Error if work dim changes - if (NewWorkDim != hCommand->WorkDim) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error If Local size and not global size - if ((pUpdateKernelLaunch->pNewLocalWorkSize != nullptr) && - (pUpdateKernelLaunch->pNewGlobalWorkSize == nullptr)) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - - // Error if local size non-nullptr and created with null - // or if local size nullptr and created with non-null - const bool IsNewLocalSizeNull = - pUpdateKernelLaunch->pNewLocalWorkSize == nullptr; - const bool IsOriginalLocalSizeNull = !hCommand->UserDefinedLocalSize; - - if (IsNewLocalSizeNull ^ IsOriginalLocalSizeNull) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } + if (pUpdateKernelLaunch->newWorkDim != hCommand->WorkDim && + (!pUpdateKernelLaunch->pNewGlobalWorkOffset || + !pUpdateKernelLaunch->pNewGlobalWorkSize)) { + return UR_RESULT_ERROR_INVALID_OPERATION; } // Find the CL USM pointer arguments to the kernel to update diff --git a/source/adapters/opencl/command_buffer.hpp b/source/adapters/opencl/command_buffer.hpp index 4c39b1ad74..d8e975a3df 100644 --- a/source/adapters/opencl/command_buffer.hpp +++ b/source/adapters/opencl/command_buffer.hpp @@ -17,6 +17,8 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_handle_t hCommandBuffer; /// OpenCL command-handle. cl_mutable_command_khr CLMutableCommand; + /// Kernel associated with this command handle + ur_kernel_handle_t Kernel; /// Work-dimension the command was originally created with. cl_uint WorkDim; /// Set to true if the user set the local work size on command creation. @@ -31,11 +33,12 @@ struct ur_exp_command_buffer_command_handle_t_ { ur_exp_command_buffer_command_handle_t_( ur_exp_command_buffer_handle_t hCommandBuffer, - cl_mutable_command_khr CLMutableCommand, cl_uint WorkDim, - bool UserDefinedLocalSize) + cl_mutable_command_khr CLMutableCommand, ur_kernel_handle_t Kernel, + cl_uint WorkDim, bool UserDefinedLocalSize) : hCommandBuffer(hCommandBuffer), CLMutableCommand(CLMutableCommand), - WorkDim(WorkDim), UserDefinedLocalSize(UserDefinedLocalSize), - RefCountInternal(0), RefCountExternal(0) {} + Kernel(Kernel), WorkDim(WorkDim), + UserDefinedLocalSize(UserDefinedLocalSize), RefCountInternal(0), + RefCountExternal(0) {} uint32_t incrementInternalReferenceCount() noexcept { return ++RefCountInternal; diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index b0621acfe4..389a1426e2 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -116,8 +116,12 @@ ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle) { return UR_RESULT_SUCCESS; } -cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, - bool &Result) { +cl_int getDeviceCommandBufferUpdateCapabilities( + cl_device_id Dev, + ur_device_command_buffer_update_capability_flags_t &UpdateCapabilities) { + + UpdateCapabilities = 0; + size_t ExtSize = 0; CL_RETURN_ON_FAILURE( clGetDeviceInfo(Dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &ExtSize)); @@ -129,21 +133,34 @@ cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, std::string SupportedExtensions(ExtStr.c_str()); if (ExtStr.find("cl_khr_command_buffer_mutable_dispatch") == std::string::npos) { - Result = false; return CL_SUCCESS; } - // All the CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR capabilities must - // be supported by a device for UR update. - cl_mutable_dispatch_fields_khr mutable_capabilities; + cl_mutable_dispatch_fields_khr MutableCapabilities; CL_RETURN_ON_FAILURE(clGetDeviceInfo( Dev, CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR, - sizeof(mutable_capabilities), &mutable_capabilities, nullptr)); - const cl_mutable_dispatch_fields_khr required_caps = - CL_MUTABLE_DISPATCH_ARGUMENTS_KHR | - CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR | - CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR | CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR | - CL_MUTABLE_DISPATCH_EXEC_INFO_KHR; - Result = (mutable_capabilities & required_caps) == required_caps; + sizeof(MutableCapabilities), &MutableCapabilities, nullptr)); + + if (!(MutableCapabilities & CL_MUTABLE_DISPATCH_EXEC_INFO_KHR)) { + return CL_SUCCESS; + } + + if (MutableCapabilities & CL_MUTABLE_DISPATCH_ARGUMENTS_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE; + } + if (MutableCapabilities & CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR) { + UpdateCapabilities |= + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + } + return CL_SUCCESS; } diff --git a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 3533f5faaf..de8b766272 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -421,5 +421,6 @@ ur_result_t mapCLErrorToUR(cl_int Result); ur_result_t getNativeHandle(void *URObj, ur_native_handle_t *NativeHandle); -cl_int deviceSupportsURCommandBufferKernelUpdate(cl_device_id Dev, - bool &Result); +cl_int getDeviceCommandBufferUpdateCapabilities( + cl_device_id Dev, + ur_device_command_buffer_update_capability_flags_t &UpdateCapabilities); diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index 6cdfb3a97d..25d1dd23d9 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -1085,12 +1085,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(ExtStr.find("cl_khr_command_buffer") != std::string::npos); } - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP: { cl_device_id Dev = cl_adapter::cast(hDevice); - bool Supported = false; + ur_device_command_buffer_update_capability_flags_t UpdateCapabilities = 0; CL_RETURN_ON_FAILURE( - deviceSupportsURCommandBufferKernelUpdate(Dev, Supported)); - return ReturnValue(Supported); + getDeviceCommandBufferUpdateCapabilities(Dev, UpdateCapabilities)); + return ReturnValue(UpdateCapabilities); } default: { return UR_RESULT_ERROR_INVALID_ENUMERATION; diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index c6e59174c9..c353059c3c 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -7127,7 +7127,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -7152,6 +7163,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( &pGlobalWorkOffset, &pGlobalWorkSize, &pLocalWorkSize, + &numKernelAlternatives, + &phKernelAlternatives, &numSyncPointsInWaitList, &pSyncPointWaitList, &pSyncPoint, @@ -7166,8 +7179,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_result_t result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, phCommand); getContext()->notify_end( UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index fb705dfc20..decf623ad5 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -8055,7 +8055,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -8090,6 +8101,14 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } + if (phKernelAlternatives == NULL && numKernelAlternatives > 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + + if (phKernelAlternatives != NULL && numKernelAlternatives == 0) { + return UR_RESULT_ERROR_INVALID_VALUE; + } + if (pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0) { return UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP; } @@ -8106,8 +8125,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ur_result_t result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, phCommand); return result; } @@ -8935,6 +8954,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (NULL == pUpdateKernelLaunch) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } + + if (pUpdateKernelLaunch->newWorkDim < 1 || + pUpdateKernelLaunch->newWorkDim > 3) { + return UR_RESULT_ERROR_INVALID_WORK_DIMENSION; + } } ur_result_t result = diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in index 5e628b4faf..63a5f1843d 100644 --- a/source/loader/loader.def.in +++ b/source/loader/loader.def.in @@ -235,6 +235,7 @@ EXPORTS urPrintContextSetExtendedDeleterParams urPrintDeviceAffinityDomainFlags urPrintDeviceBinary + urPrintDeviceCommandBufferUpdateCapabilityFlags urPrintDeviceCreateWithNativeHandleParams urPrintDeviceExecCapabilityFlags urPrintDeviceFpCapabilityFlags diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in index 18e4018aee..a4ca4a713f 100644 --- a/source/loader/loader.map.in +++ b/source/loader/loader.map.in @@ -235,6 +235,7 @@ urPrintContextSetExtendedDeleterParams; urPrintDeviceAffinityDomainFlags; urPrintDeviceBinary; + urPrintDeviceCommandBufferUpdateCapabilityFlags; urPrintDeviceCreateWithNativeHandleParams; urPrintDeviceExecCapabilityFlags; urPrintDeviceFpCapabilityFlags; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index 26f55c071f..8527d622ee 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -7105,7 +7105,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -7138,11 +7149,20 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handles to platform handles + auto phKernelAlternativesLocal = + std::vector(numKernelAlternatives); + for (size_t i = 0; i < numKernelAlternatives; ++i) { + phKernelAlternativesLocal[i] = + reinterpret_cast(phKernelAlternatives[i]) + ->handle; + } + // forward to device-platform result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, - phCommand); + pLocalWorkSize, numKernelAlternatives, phKernelAlternativesLocal.data(), + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, phCommand); if (UR_RESULT_SUCCESS != result) { return result; @@ -7872,6 +7892,13 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( // Deal with any struct parameters that have handle members we need to convert. auto pUpdateKernelLaunchLocal = *pUpdateKernelLaunch; + if (pUpdateKernelLaunchLocal.hNewKernel) { + pUpdateKernelLaunchLocal.hNewKernel = + reinterpret_cast( + pUpdateKernelLaunchLocal.hNewKernel) + ->handle; + } + std::vector pUpdateKernelLaunchpNewMemObjArgList; for (uint32_t i = 0; i < pUpdateKernelLaunch->numNewMemObjArgs; i++) { diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 3ccc51133b..ceb108c961 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -7525,6 +7525,9 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` @@ -7541,7 +7544,18 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -7559,10 +7573,10 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnAppendKernelLaunchExp(hCommandBuffer, hKernel, workDim, - pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, numSyncPointsInWaitList, - pSyncPointWaitList, pSyncPoint, phCommand); + return pfnAppendKernelLaunchExp( + hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numKernelAlternatives, phKernelAlternatives, + numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, phCommand); } catch (...) { return exceptionToResult(std::current_exception()); } @@ -8301,18 +8315,17 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created. +/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and, pUpdateKernelLaunch->pNewGlobalWorkSize, or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index f9d510e95d..454dc6d436 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -980,6 +980,14 @@ urPrintExpImageCopyRegion(const struct ur_exp_image_copy_region_t params, return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintDeviceCommandBufferUpdateCapabilityFlags( + enum ur_device_command_buffer_update_capability_flag_t value, char *buffer, + const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << value; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintExpCommandBufferInfo(enum ur_exp_command_buffer_info_t value, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 3e024ede0f..401271d29e 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -6381,6 +6381,9 @@ ur_result_t UR_APICALL urCommandBufferFinalizeExp( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + `phKernelAlternatives == NULL && numKernelAlternatives > 0` +/// + `phKernelAlternatives != NULL && numKernelAlternatives == 0` +/// + If `phKernelAlternatives` contains `hKernel` /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP /// + `pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0` @@ -6397,7 +6400,18 @@ ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t * pGlobalWorkSize, ///< [in] Global work size to use when executing kernel. const size_t * - pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. + pLocalWorkSize, ///< [in][optional] Local work size to use when executing kernel. If this + ///< parameter is nullptr, then a local work size will be generated by the + ///< implementation. + uint32_t + numKernelAlternatives, ///< [in] The number of kernel alternatives provided in + ///< phKernelAlternatives. + ur_kernel_handle_t * + phKernelAlternatives, ///< [in][optional][range(0, numKernelAlternatives)] List of kernel handles + ///< that might be used to update the kernel in this + ///< command after the command-buffer is finalized. The default kernel + ///< `hKernel` is implicitly marked as an alternative. It's + ///< invalid to specify it as part of this list. uint32_t numSyncPointsInWaitList, ///< [in] The number of sync points in the provided dependency list. const ur_exp_command_buffer_sync_point_t * @@ -7014,18 +7028,17 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// + If ::ur_exp_command_buffer_desc_t::isUpdatable was not set to true on creation of the command buffer `hCommand` belongs to. /// + If the command-buffer `hCommand` belongs to has not been finalized. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and different from the work-dim used on creation of `hCommand`. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value and `pUpdateKernelLaunch->pNewGlobalWorkSize` is NULL. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a non-NULL value when `hCommand` was created with a NULL local work size. -/// + If `pUpdateKernellaunch->newWorkDim` is non-zero and `pUpdateKernelLaunch->pNewLocalWorkSize` is set to a NULL value when `hCommand` was created with a non-NULL local work size. /// - ::UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_COMMAND_HANDLE_EXP /// - ::UR_RESULT_ERROR_INVALID_MEM_OBJECT /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX /// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION +/// + `pUpdateKernelLaunch->newWorkDim < 1 || pUpdateKernelLaunch->newWorkDim > 3` /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// + If `pUpdateKernelLaunch->hNewKernel` was not passed to the `hKernel` or `phKernelAlternatives` parameters of ::urCommandBufferAppendKernelLaunchExp when this command was created. +/// + If `pUpdateKernelLaunch->newWorkDim` is different from the current workDim in `hCommand` and, pUpdateKernelLaunch->pNewGlobalWorkSize, or pUpdateKernelLaunch->pNewGlobalWorkOffset are nullptr. /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 5445531961..912402b7a5 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -141,6 +141,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_2d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm_2d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp) diff --git a/test/conformance/device_code/fill_usm_2d.cpp b/test/conformance/device_code/fill_usm_2d.cpp new file mode 100644 index 0000000000..2cfba67884 --- /dev/null +++ b/test/conformance/device_code/fill_usm_2d.cpp @@ -0,0 +1,31 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + size_t nd_range_x = 8; + size_t nd_range_y = 8; + + auto nd_range = sycl::range<2>(nd_range_x, nd_range_y); + + std::vector A(nd_range_x * nd_range_y, 1); + uint32_t val = 42; + sycl::queue sycl_queue; + + auto work_range = sycl::nd_range<2>(nd_range, sycl::range<2>(1, 1)); + + uint32_t *data = + sycl::malloc_shared(nd_range_x * nd_range_y, sycl_queue); + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + work_range, [data, val](sycl::nd_item<2> item_id) { + auto id = item_id.get_global_linear_id(); + data[id] = val; + }); + }); + return 0; +} diff --git a/test/conformance/exp_command_buffer/CMakeLists.txt b/test/conformance/exp_command_buffer/CMakeLists.txt index a28d692d9b..0162a2dfe3 100644 --- a/test/conformance/exp_command_buffer/CMakeLists.txt +++ b/test/conformance/exp_command_buffer/CMakeLists.txt @@ -4,14 +4,15 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_conformance_test_with_kernels_environment(exp_command_buffer - buffer_fill_kernel_update.cpp - usm_fill_kernel_update.cpp - buffer_saxpy_kernel_update.cpp - usm_saxpy_kernel_update.cpp - ndrange_update.cpp release.cpp retain.cpp - invalid_update.cpp commands.cpp fill.cpp + update/buffer_fill_kernel_update.cpp + update/invalid_update.cpp + update/kernel_handle_update.cpp + update/usm_fill_kernel_update.cpp + update/buffer_saxpy_kernel_update.cpp + update/ndrange_update.cpp + update/usm_saxpy_kernel_update.cpp ) diff --git a/test/conformance/exp_command_buffer/commands.cpp b/test/conformance/exp_command_buffer/commands.cpp index 53898ce60a..ae1508ac64 100644 --- a/test/conformance/exp_command_buffer/commands.cpp +++ b/test/conformance/exp_command_buffer/commands.cpp @@ -189,7 +189,7 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferAppendKernelLaunchExpTest); TEST_P(urCommandBufferAppendKernelLaunchExpTest, Basic) { ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, - &local_size, 0, nullptr, nullptr, nullptr)); + &local_size, 0, nullptr, 0, nullptr, nullptr, nullptr)); ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match index 8b13789179..e69de29bb2 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match @@ -1 +0,0 @@ - diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match index 8b13789179..e69de29bb2 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match @@ -1 +0,0 @@ - diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match index c4787da327..b2833effcc 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero_v2.match @@ -1,47 +1,20 @@ {{NONDETERMINISTIC}} -BufferFillCommandTest.UpdateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -BufferFillCommandTest.UpdateGlobalSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -BufferFillCommandTest.SeparateUpdateCalls/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -BufferFillCommandTest.OverrideUpdate/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -BufferFillCommandTest.OverrideArgList/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMFillCommandTest.UpdateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMFillCommandTest.UpdateBeforeEnqueue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMMultipleFillCommandTest.UpdateAllKernels/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -BufferSaxpyKernelTest.UpdateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMSaxpyKernelTest.UpdateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMMultiSaxpyKernelTest.UpdateParameters/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -USMMultiSaxpyKernelTest.UpdateWithoutBlocking/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -NDRangeUpdateTest.Update3D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -NDRangeUpdateTest.Update2D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -NDRangeUpdateTest.Update1D/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -NDRangeUpdateTest.Invalid/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseCommandExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseCommandExpTest.ReleaseCmdBufBeforeHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseCommandExpTest.ReleaseCmdBufMultipleHandles/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferReleaseCommandExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferRetainExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferRetainExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferRetainCommandExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferRetainCommandExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -InvalidUpdateTest.NotFinalizedCommandBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -InvalidUpdateTest.NotUpdatableCommandBuffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -InvalidUpdateTest.GlobalLocalSizeMistach/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -InvalidUpdateTest.ImplToUserDefinedLocalSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -InvalidUpdateTest.UserToImplDefinedLocalSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendUSMMemcpyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendUSMFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendMemBufferFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendUSMPrefetchExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferCommandsTest.urCommandBufferAppendUSMAdviseExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urCommandBufferAppendKernelLaunchExpTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ +urCommandBufferReleaseExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferReleaseExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferRetainExpTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferRetainExpTest.InvalidNullHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMMemcpyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferCopyRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferReadRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferWriteRectExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendMemBufferFillExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMPrefetchExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferCommandsTest.urCommandBufferAppendUSMAdviseExp/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ +urCommandBufferAppendKernelLaunchExpTest.Basic/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___ urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1__patternSize__1 urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__256 urCommandBufferFillCommandsTest.Buffer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__1024__patternSize__256 @@ -55,4 +28,4 @@ urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Z urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__4 urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__8 urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__16 -urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 +urCommandBufferFillCommandsTest.USM/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___size__256__patternSize__32 \ No newline at end of file diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index eeb0aff5cf..74fdaf14ee 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -1,9 +1,19 @@ {{NONDETERMINISTIC}} +{{OPT}}urCommandBufferReleaseCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufBeforeHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufMultipleHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferReleaseCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferRetainCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.UpdateGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.SeparateUpdateCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.NotFinalizedCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.NotUpdatableCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}InvalidUpdateTest.InvalidDimensions/SYCL_NATIVE_CPU___SYCL_Native_CPU__X_ {{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} @@ -14,16 +24,8 @@ {{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} {{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}NDRangeUpdateTest.Invalid/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufBeforeHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.ReleaseCmdBufMultipleHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferReleaseCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferRetainCommandExpTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.NotFinalizedCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.NotUpdatableCommandBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.GlobalLocalSizeMistach/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}InvalidUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} -{{OPT}}urCommandBufferAppendKernelLaunchExpTest.Basic/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}NDRangeUpdateTest.ImplToUserDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}NDRangeUpdateTest.UserToImplDefinedLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} +{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}} diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 9a38772eb7..442cbbc7f6 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -11,34 +11,49 @@ namespace uur { namespace command_buffer { -struct urCommandBufferExpTest : uur::urContextTest { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); +static void checkCommandBufferSupport(ur_device_handle_t device) { + size_t returned_size; + ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, + nullptr, &returned_size)); - size_t returned_size; - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, - nullptr, &returned_size)); + std::unique_ptr returned_extensions(new char[returned_size]); - std::unique_ptr returned_extensions(new char[returned_size]); + ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, + returned_size, returned_extensions.get(), + nullptr)); - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, - returned_size, returned_extensions.get(), - nullptr)); + std::string_view extensions_string(returned_extensions.get()); + bool command_buffer_support = + extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != + std::string::npos; - std::string_view extensions_string(returned_extensions.get()); - bool command_buffer_support = - extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != - std::string::npos; - - if (!command_buffer_support) { - GTEST_SKIP() << "EXP command-buffer feature is not supported."; - } + if (!command_buffer_support) { + GTEST_SKIP() << "EXP command-buffer feature is not supported."; + } +} + +static void checkCommandBufferUpdateSupport( + ur_device_handle_t device, + ur_device_command_buffer_update_capability_flags_t required_capabilities) { + ur_device_command_buffer_update_capability_flags_t update_capability_flags; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP, + sizeof(update_capability_flags), &update_capability_flags, nullptr)); + + if (!update_capability_flags) { + GTEST_SKIP() << "Updating EXP command-buffers is not supported."; + } else if ((update_capability_flags & required_capabilities) != + required_capabilities) { + GTEST_SKIP() << "Some of the command-buffer update capabilities " + "required are not supported by the device."; + } +} - ASSERT_SUCCESS(urDeviceGetInfo( - device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP, - sizeof(ur_bool_t), &updatable_command_buffer_support, nullptr)); +struct urCommandBufferExpTest : uur::urContextTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urContextTest::SetUp()); - // Create a command-buffer + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); @@ -52,7 +67,6 @@ struct urCommandBufferExpTest : uur::urContextTest { } ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; - ur_bool_t updatable_command_buffer_support = false; }; template @@ -60,26 +74,7 @@ struct urCommandBufferExpTestWithParam : urQueueTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTestWithParam::SetUp()); - size_t returned_size; - ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, - 0, nullptr, &returned_size)); - - std::unique_ptr returned_extensions(new char[returned_size]); - - ASSERT_SUCCESS(urDeviceGetInfo(this->device, UR_DEVICE_INFO_EXTENSIONS, - returned_size, returned_extensions.get(), - nullptr)); - - std::string_view extensions_string(returned_extensions.get()); - bool command_buffer_support = - extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != - std::string::npos; - - if (!command_buffer_support) { - GTEST_SKIP() << "EXP command-buffer feature is not supported."; - } - - // Create a command-buffer + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(this->device)); ASSERT_SUCCESS(urCommandBufferCreateExp(this->context, this->device, nullptr, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); @@ -99,33 +94,7 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); - ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - - size_t returned_size; - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, 0, - nullptr, &returned_size)); - - std::unique_ptr returned_extensions(new char[returned_size]); - - ASSERT_SUCCESS(urDeviceGetInfo(device, UR_DEVICE_INFO_EXTENSIONS, - returned_size, returned_extensions.get(), - nullptr)); - - std::string_view extensions_string(returned_extensions.get()); - bool command_buffer_support = - extensions_string.find(UR_COMMAND_BUFFER_EXTENSION_STRING_EXP) != - std::string::npos; - - if (!command_buffer_support) { - GTEST_SKIP() << "EXP command-buffer feature is not supported."; - } - - ASSERT_SUCCESS(urDeviceGetInfo( - device, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP, - sizeof(ur_bool_t), &updatable_command_buffer_support, nullptr)); - - // Create a command-buffer + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, nullptr, &cmd_buf_handle)); ASSERT_NE(cmd_buf_handle, nullptr); @@ -139,18 +108,60 @@ struct urCommandBufferExpExecutionTest : uur::urKernelExecutionTest { } ur_exp_command_buffer_handle_t cmd_buf_handle = nullptr; - ur_bool_t updatable_command_buffer_support = false; - ur_platform_backend_t backend{}; }; -struct urUpdatableCommandBufferExpExecutionTest - : urCommandBufferExpExecutionTest { +struct urUpdatableCommandBufferExpTest : uur::urQueueTest { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urCommandBufferExpExecutionTest ::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::SetUp()); + + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + + auto required_capabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + UUR_RETURN_ON_FATAL_FAILURE( + checkCommandBufferUpdateSupport(device, required_capabilities)); + + // Create a command-buffer with update enabled. + ur_exp_command_buffer_desc_t desc{ + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true}; - if (!updatable_command_buffer_support) { - GTEST_SKIP() << "Updating EXP command-buffers is not supported."; + ASSERT_SUCCESS(urCommandBufferCreateExp(context, device, &desc, + &updatable_cmd_buf_handle)); + ASSERT_NE(updatable_cmd_buf_handle, nullptr); + } + + void TearDown() override { + if (updatable_cmd_buf_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseExp(updatable_cmd_buf_handle)); } + UUR_RETURN_ON_FATAL_FAILURE(uur::urQueueTest::TearDown()); + } + + ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; + ur_platform_backend_t backend{}; +}; + +struct urUpdatableCommandBufferExpExecutionTest : uur::urKernelExecutionTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urKernelExecutionTest::SetUp()); + + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + + UUR_RETURN_ON_FATAL_FAILURE(checkCommandBufferSupport(device)); + auto required_capabilities = + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_ARGUMENTS | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_LOCAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_SIZE | + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_GLOBAL_WORK_OFFSET; + UUR_RETURN_ON_FATAL_FAILURE( + checkCommandBufferUpdateSupport(device, required_capabilities)); + + UUR_RETURN_ON_FATAL_FAILURE( + checkCommandBufferUpdateSupport(device, required_capabilities)); // Create a command-buffer with update enabled. ur_exp_command_buffer_desc_t desc{ @@ -166,10 +177,10 @@ struct urUpdatableCommandBufferExpExecutionTest if (updatable_cmd_buf_handle) { EXPECT_SUCCESS(urCommandBufferReleaseExp(updatable_cmd_buf_handle)); } - UUR_RETURN_ON_FATAL_FAILURE( - urCommandBufferExpExecutionTest::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::TearDown()); } + ur_platform_backend_t backend{}; ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; }; @@ -182,12 +193,14 @@ struct urCommandBufferCommandExpTest // Append 2 kernel commands to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle_2)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle_2)); ASSERT_NE(command_handle_2, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -214,6 +227,83 @@ struct urCommandBufferCommandExpTest ur_exp_command_buffer_command_handle_t command_handle = nullptr; ur_exp_command_buffer_command_handle_t command_handle_2 = nullptr; }; + +struct TestKernel { + + TestKernel(std::string Name, ur_platform_handle_t Platform, + ur_context_handle_t Context, ur_device_handle_t Device) + : Name(std::move(Name)), Platform(Platform), Context(Context), + Device(Device) {} + + virtual ~TestKernel() = default; + + virtual void buildKernel() { + std::shared_ptr> ILBinary; + std::vector Metadatas{}; + + ur_platform_backend_t Backend; + ASSERT_SUCCESS(urPlatformGetInfo(Platform, UR_PLATFORM_INFO_BACKEND, + sizeof(Backend), &Backend, nullptr)); + + ASSERT_NO_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource(Name, ILBinary)); + + const ur_program_properties_t Properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast(Metadatas.size()), + Metadatas.empty() ? nullptr : Metadatas.data()}; + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + Platform, Context, Device, *ILBinary, &Properties, &Program)); + + auto KernelNames = + uur::KernelsEnvironment::instance->GetEntryPointNames(Name); + std::string KernelName = KernelNames[0]; + ASSERT_FALSE(KernelName.empty()); + + ASSERT_SUCCESS(urProgramBuild(Context, Program, nullptr)); + ASSERT_SUCCESS(urKernelCreate(Program, KernelName.data(), &Kernel)); + } + + virtual void setUpKernel() = 0; + + virtual void destroyKernel() { + ASSERT_SUCCESS(urKernelRelease(Kernel)); + ASSERT_SUCCESS(urProgramRelease(Program)); + }; + + virtual void validate() = 0; + + std::string Name; + ur_platform_handle_t Platform; + ur_context_handle_t Context; + ur_device_handle_t Device; + ur_program_handle_t Program; + ur_kernel_handle_t Kernel; +}; + +struct urCommandBufferMultipleKernelUpdateTest + : uur::command_buffer::urUpdatableCommandBufferExpTest { + virtual void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urUpdatableCommandBufferExpTest::SetUp()); + } + + virtual void TearDown() override { + for (auto &TestKernel : TestKernels) { + UUR_RETURN_ON_FATAL_FAILURE(TestKernel->destroyKernel()); + } + UUR_RETURN_ON_FATAL_FAILURE( + urUpdatableCommandBufferExpTest::TearDown()); + } + + void setUpKernels() { + for (auto &TestKernel : TestKernels) { + UUR_RETURN_ON_FATAL_FAILURE(TestKernel->setUpKernel()); + } + } + + std::vector> TestKernels{}; +}; + } // namespace command_buffer } // namespace uur diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp similarity index 91% rename from test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp index 78e1ffd009..08be337466 100644 --- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/buffer_fill_kernel_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" // Test that updating a command-buffer with a single kernel command // taking USM arguments works correctly. @@ -49,7 +49,8 @@ struct BufferFillCommandTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -72,7 +73,7 @@ struct BufferFillCommandTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t buffer_size = sizeof(val) * global_size; ur_mem_handle_t buffer = nullptr; ur_mem_handle_t new_buffer = nullptr; @@ -123,10 +124,11 @@ TEST_P(BufferFillCommandTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -175,10 +177,11 @@ TEST_P(BufferFillCommandTest, UpdateGlobalSize) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 1, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -225,10 +228,11 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t output_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 1, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim &new_output_desc, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -253,10 +257,11 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t input_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -271,16 +276,17 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ur_exp_command_buffer_update_kernel_launch_desc_t global_size_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - static_cast(n_dimensions), // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + &new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize }; ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( @@ -315,10 +321,11 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) { ur_exp_command_buffer_update_kernel_launch_desc_t first_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &first_input_desc, // pNewValueArgList @@ -342,10 +349,11 @@ TEST_P(BufferFillCommandTest, OverrideUpdate) { ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &second_input_desc, // pNewValueArgList @@ -398,16 +406,17 @@ TEST_P(BufferFillCommandTest, OverrideArgList) { ur_exp_command_buffer_update_kernel_launch_desc_t second_update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 2, // numNewValueArgs - 0, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - input_descs, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 2, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + input_descs, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle, diff --git a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp similarity index 97% rename from test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp index 5a80be188f..f4b478a12f 100644 --- a/test/conformance/exp_command_buffer/buffer_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/buffer_saxpy_kernel_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" #include // Test that updating a command-buffer with a single kernel command @@ -84,7 +84,8 @@ struct BufferSaxpyKernelTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -130,7 +131,7 @@ struct BufferSaxpyKernelTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr uint32_t A = 42; std::array buffers = {nullptr, nullptr, nullptr, nullptr}; @@ -184,10 +185,11 @@ TEST_P(BufferSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 2, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim new_input_descs, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_A_desc, // pNewValueArgList diff --git a/test/conformance/exp_command_buffer/invalid_update.cpp b/test/conformance/exp_command_buffer/update/invalid_update.cpp similarity index 65% rename from test/conformance/exp_command_buffer/invalid_update.cpp rename to test/conformance/exp_command_buffer/update/invalid_update.cpp index afcb279fa9..101802de17 100644 --- a/test/conformance/exp_command_buffer/invalid_update.cpp +++ b/test/conformance/exp_command_buffer/update/invalid_update.cpp @@ -3,7 +3,8 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" +#include #include // Negative tests that correct error codes are thrown on invalid update usage. @@ -36,7 +37,8 @@ struct InvalidUpdateTest // Append kernel command to command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); } @@ -64,7 +66,7 @@ struct InvalidUpdateTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t allocation_size = sizeof(val) * global_size; void *shared_ptr = nullptr; ur_exp_command_buffer_command_handle_t command_handle = nullptr; @@ -89,10 +91,11 @@ TEST_P(InvalidUpdateTest, NotFinalizedCommandBuffer) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -119,7 +122,7 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { ur_exp_command_buffer_command_handle_t test_command_handle = nullptr; EXPECT_SUCCESS(urCommandBufferAppendKernelLaunchExp( test_cmd_buf_handle, kernel, n_dimensions, &global_offset, &global_size, - &local_size, 0, nullptr, nullptr, &test_command_handle)); + &local_size, 0, nullptr, 0, nullptr, nullptr, &test_command_handle)); EXPECT_NE(test_command_handle, nullptr); EXPECT_SUCCESS(urCommandBufferFinalizeExp(test_cmd_buf_handle)); @@ -139,10 +142,11 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -165,98 +169,53 @@ TEST_P(InvalidUpdateTest, NotUpdatableCommandBuffer) { } } -// Test setting `pNewLocalWorkSize` to a non-NULL value and `pNewGlobalWorkSize` -// to NULL gives the correct error. -TEST_P(InvalidUpdateTest, GlobalLocalSizeMistach) { +// If the number of dimensions change, then the global work size and offset +// also need to be updated. +TEST_P(InvalidUpdateTest, InvalidDimensions) { ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); finalized = true; - size_t new_local_size = 16; - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype - nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize - }; - - // Update command local size but not global size - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); -} - -// Test setting `pNewLocalWorkSize` to a non-NULL value when the command was -// created with a NULL local work size gives the correct error. -TEST_P(InvalidUpdateTest, ImplToUserDefinedLocalSize) { - // Append kernel command to command-buffer using NULL local work size - ur_exp_command_buffer_command_handle_t second_command_handle = nullptr; - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, nullptr, 0, nullptr, nullptr, &second_command_handle)); - ASSERT_NE(second_command_handle, nullptr); - - EXPECT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - finalized = true; + uint32_t new_dimensions = 2; + std::array new_global_offset{0, 0}; + std::array new_global_size{64, 64}; - size_t new_global_size = 64; - size_t new_local_size = 16; ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + new_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; - // Update command local size to non-NULL when created with NULL value - ur_result_t result = urCommandBufferUpdateKernelLaunchExp( - second_command_handle, &update_desc); - EXPECT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); - - if (second_command_handle) { - EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(second_command_handle)); - } -} - -// Test setting `pNewLocalWorkSize` to a NULL value when the command was -// created with a non-NULL local work size gives the correct error. -TEST_P(InvalidUpdateTest, UserToImplDefinedLocalSize) { - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); - finalized = true; + ASSERT_EQ( + UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); - size_t new_global_size = 64; - ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - n_dimensions, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + new_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; - // Update command local size to NULL when created with non-NULL value - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); + ASSERT_EQ( + UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); } diff --git a/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp new file mode 100644 index 0000000000..bf432c4815 --- /dev/null +++ b/test/conformance/exp_command_buffer/update/kernel_handle_update.cpp @@ -0,0 +1,469 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "../fixtures.h" +#include "uur/raii.h" +#include +#include + +struct TestSaxpyKernel : public uur::command_buffer::TestKernel { + + TestSaxpyKernel(ur_platform_handle_t Platform, ur_context_handle_t Context, + ur_device_handle_t Device) + : TestKernel("saxpy_usm", Platform, Context, Device) {} + + ~TestSaxpyKernel() override = default; + + void setUpKernel() override { + + ASSERT_NO_FATAL_FAILURE(buildKernel()); + + const size_t AllocationSize = sizeof(uint32_t) * GlobalSize; + for (auto &SharedPtr : Allocations) { + ASSERT_SUCCESS(urUSMSharedAlloc(Context, Device, nullptr, nullptr, + AllocationSize, &SharedPtr)); + ASSERT_NE(SharedPtr, nullptr); + + std::vector pattern(AllocationSize); + uur::generateMemFillPattern(pattern); + std::memcpy(SharedPtr, pattern.data(), AllocationSize); + } + + // Index 0 is the output + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 0, nullptr, Allocations[0])); + // Index 1 is A + ASSERT_SUCCESS(urKernelSetArgValue(Kernel, 1, sizeof(A), nullptr, &A)); + // Index 2 is X + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 2, nullptr, Allocations[1])); + // Index 3 is Y + ASSERT_SUCCESS( + urKernelSetArgPointer(Kernel, 3, nullptr, Allocations[2])); + + UpdatePointerDesc[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &Allocations[0], // pArgValue + }; + + UpdatePointerDesc[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &Allocations[1], // pArgValue + }; + + UpdatePointerDesc[2] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + nullptr, // pProperties + &Allocations[2], // pArgValue + }; + + UpdateValDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(A), // argSize + nullptr, // pProperties + &A, // hArgValue + }; + + UpdateDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + Kernel, // hNewKernel + 0, // numNewMemObjArgs + 3, // numNewPointerArgs + 1, // numNewValueArgs + NDimensions, // newWorkDim + nullptr, // pNewMemObjArgList + UpdatePointerDesc.data(), // pNewPointerArgList + &UpdateValDesc, // pNewValueArgList + &GlobalOffset, // pNewGlobalWorkOffset + &GlobalSize, // pNewGlobalWorkSize + &LocalSize, // pNewLocalWorkSize + }; + } + + void destroyKernel() override { + for (auto &Allocation : Allocations) { + if (Allocation) { + EXPECT_SUCCESS(urUSMFree(Context, Allocation)); + } + } + ASSERT_NO_FATAL_FAILURE(TestKernel::destroyKernel()); + } + + void validate() override { + auto *output = static_cast(Allocations[0]); + auto *X = static_cast(Allocations[1]); + auto *Y = static_cast(Allocations[2]); + + for (size_t i = 0; i < GlobalSize; i++) { + uint32_t result = A * X[i] + Y[i]; + ASSERT_EQ(result, output[i]); + } + } + + std::array + UpdatePointerDesc; + ur_exp_command_buffer_update_value_arg_desc_t UpdateValDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + + size_t LocalSize = 4; + size_t GlobalSize = 32; + size_t GlobalOffset = 0; + uint32_t NDimensions = 1; + uint32_t A = 42; + + std::array Allocations = {nullptr, nullptr, nullptr}; +}; + +struct TestFill2DKernel : public uur::command_buffer::TestKernel { + + TestFill2DKernel(ur_platform_handle_t Platform, ur_context_handle_t Context, + ur_device_handle_t Device) + : TestKernel("fill_usm_2d", Platform, Context, Device) {} + + ~TestFill2DKernel() override = default; + + void setUpKernel() override { + ASSERT_NO_FATAL_FAILURE(buildKernel()); + + const size_t allocation_size = sizeof(uint32_t) * SizeX * SizeY; + ASSERT_SUCCESS(urUSMSharedAlloc(Context, Device, nullptr, nullptr, + allocation_size, &Memory)); + + // Index 0 is the output + ASSERT_SUCCESS(urKernelSetArgPointer(Kernel, 0, nullptr, Memory)); + // Index 1 is the fill value + ASSERT_SUCCESS( + urKernelSetArgValue(Kernel, 1, sizeof(Val), nullptr, &Val)); + + ASSERT_NE(Memory, nullptr); + + std::vector pattern(allocation_size); + uur::generateMemFillPattern(pattern); + std::memcpy(Memory, pattern.data(), allocation_size); + + UpdatePointerDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + nullptr, // pProperties + &Memory, // pArgValue + }; + + UpdateValDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(Val), // argSize + nullptr, // pProperties + &Val, // hArgValue + }; + + UpdateDesc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + Kernel, // hNewKernel + 0, // numNewMemObjArgs + 1, // numNewPointerArgs + 1, // numNewValueArgs + NDimensions, // newWorkDim + nullptr, // pNewMemObjArgList + &UpdatePointerDesc, // pNewPointerArgList + &UpdateValDesc, // pNewValueArgList + GlobalOffset.data(), // pNewGlobalWorkOffset + GlobalSize.data(), // pNewGlobalWorkSize + LocalSize.data(), // pNewLocalWorkSize + }; + } + + void destroyKernel() override { + if (Memory) { + EXPECT_SUCCESS(urUSMFree(Context, Memory)); + } + ASSERT_NO_FATAL_FAILURE(TestKernel::destroyKernel()); + } + + void validate() override { + for (size_t i = 0; i < SizeX * SizeY; i++) { + ASSERT_EQ(static_cast(Memory)[i], Val); + } + } + + ur_exp_command_buffer_update_pointer_arg_desc_t UpdatePointerDesc; + ur_exp_command_buffer_update_value_arg_desc_t UpdateValDesc; + ur_exp_command_buffer_update_kernel_launch_desc_t UpdateDesc; + + std::vector LocalSize = {4, 4}; + const size_t SizeX = 64; + const size_t SizeY = 64; + std::vector GlobalSize = {SizeX, SizeY}; + std::vector GlobalOffset = {0, 0}; + uint32_t NDimensions = 2; + + void *Memory; + uint32_t Val = 42; +}; + +struct urCommandBufferKernelHandleUpdateTest + : uur::command_buffer::urCommandBufferMultipleKernelUpdateTest { + virtual void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferMultipleKernelUpdateTest:: + SetUp()); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::checkCommandBufferUpdateSupport( + device, + UR_DEVICE_COMMAND_BUFFER_UPDATE_CAPABILITY_FLAG_KERNEL_HANDLE)); + + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + SaxpyKernel = std::make_shared( + TestSaxpyKernel(platform, context, device)); + FillUSM2DKernel = std::make_shared( + TestFill2DKernel(platform, context, device)); + TestKernels.push_back(SaxpyKernel); + TestKernels.push_back(FillUSM2DKernel); + + this->setUpKernels(); + } + + virtual void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::command_buffer::urCommandBufferMultipleKernelUpdateTest:: + TearDown()); + } + + std::shared_ptr SaxpyKernel; + std::shared_ptr FillUSM2DKernel; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferKernelHandleUpdateTest); + +/* Tests that it is possible to update the kernel handle of a command-buffer node. + * This test launches a Saxpy kernel using a command-buffer and then updates the + * node with a completely different kernel that does a fill 2D operation. */ +TEST_P(urCommandBufferKernelHandleUpdateTest, Success) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +/* Test that updates to the command kernel handle are stored in the command handle */ +TEST_P(urCommandBufferKernelHandleUpdateTest, UpdateAgain) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + // If the Kernel was not stored properly in the command, then this could potentially fail since + // it would try to use the Saxpy kernel + FillUSM2DKernel->Val = 78; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +/* Test that it is possible to change the kernel handle in a command and later restore it to the original handle */ +TEST_P(urCommandBufferKernelHandleUpdateTest, RestoreOriginalKernel) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + // Updating A, so that the second launch of the saxpy kernel actually has a different output. + SaxpyKernel->A = 20; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &SaxpyKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); +} + +TEST_P(urCommandBufferKernelHandleUpdateTest, KernelAlternativeNotRegistered) { + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), 0, nullptr, 0, nullptr, nullptr, + CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); +} + +TEST_P(urCommandBufferKernelHandleUpdateTest, + RegisterInvalidKernelAlternative) { + + std::vector KernelAlternatives = {SaxpyKernel->Kernel}; + + ur_exp_command_buffer_command_handle_t CommandHandle; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_VALUE, + urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, + SaxpyKernel->NDimensions, &(SaxpyKernel->GlobalOffset), + &(SaxpyKernel->GlobalSize), &(SaxpyKernel->LocalSize), + KernelAlternatives.size(), KernelAlternatives.data(), + 0, nullptr, nullptr, &CommandHandle)); +} + +using urCommandBufferValidUpdateParametersTest = + urCommandBufferKernelHandleUpdateTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urCommandBufferValidUpdateParametersTest); + +// Test that updating the dimensions of a kernel command does not cause an error. +TEST_P(urCommandBufferValidUpdateParametersTest, + UpdateDimensionsWithoutUpdatingKernel) { + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, FillUSM2DKernel->Kernel, + FillUSM2DKernel->NDimensions, FillUSM2DKernel->GlobalOffset.data(), + FillUSM2DKernel->GlobalSize.data(), FillUSM2DKernel->LocalSize.data(), + 0, nullptr, 0, nullptr, nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); + + size_t newGlobalWorkSize = + FillUSM2DKernel->GlobalSize[0] * FillUSM2DKernel->GlobalSize[1]; + size_t newGlobalWorkOffset = 0; + + // Since the fill2D kernel relies on the globalID, it will still work if we + // change the work dimensions to 1. + FillUSM2DKernel->UpdateDesc.newWorkDim = 1; + FillUSM2DKernel->UpdateDesc.pNewGlobalWorkSize = &newGlobalWorkSize; + FillUSM2DKernel->UpdateDesc.pNewGlobalWorkOffset = &newGlobalWorkOffset; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &FillUSM2DKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(FillUSM2DKernel->validate()); +} + +// Test that updating only the local work size does not cause an error. +TEST_P(urCommandBufferValidUpdateParametersTest, UpdateOnlyLocalWorkSize) { + + std::vector KernelAlternatives = { + FillUSM2DKernel->Kernel}; + + uur::raii::CommandBufferCommand CommandHandle; + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, SaxpyKernel->Kernel, SaxpyKernel->NDimensions, + &(SaxpyKernel->GlobalOffset), &(SaxpyKernel->GlobalSize), + &(SaxpyKernel->LocalSize), KernelAlternatives.size(), + KernelAlternatives.data(), 0, nullptr, nullptr, CommandHandle.ptr())); + ASSERT_NE(CommandHandle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + SaxpyKernel->UpdateDesc.pNewGlobalWorkOffset = nullptr; + SaxpyKernel->UpdateDesc.pNewGlobalWorkSize = nullptr; + size_t newLocalSize = SaxpyKernel->LocalSize * 4; + SaxpyKernel->UpdateDesc.pNewLocalWorkSize = &newLocalSize; + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + CommandHandle, &SaxpyKernel->UpdateDesc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + ASSERT_NO_FATAL_FAILURE(SaxpyKernel->validate()); +} diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/update/ndrange_update.cpp similarity index 56% rename from test/conformance/exp_command_buffer/ndrange_update.cpp rename to test/conformance/exp_command_buffer/update/ndrange_update.cpp index 5cbfffa9cd..ee18c33b32 100644 --- a/test/conformance/exp_command_buffer/ndrange_update.cpp +++ b/test/conformance/exp_command_buffer/update/ndrange_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" #include #include @@ -30,30 +30,22 @@ struct NDRangeUpdateTest std::memset(shared_ptr, 0, allocation_size); ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, shared_ptr)); - - // Add a 3 dimension kernel command to command-buffer and close - // command-buffer - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, - global_offset.data(), global_size.data(), local_size.data(), 0, - nullptr, nullptr, &command_handle)); - ASSERT_NE(command_handle, nullptr); - - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); } // For each work-item the kernel prints the global id and local id in each // of the 3 dimensions to an offset in the output based on global linear // id. void Validate(std::array global_size, - std::array local_size, + std::optional> local_size, std::array global_offset) { + // DPC++ swaps the X & Z dimension for 3 Dimensional kernels // between those set by user and SPIR-V builtins. // See `ReverseRangeDimensionsForKernel()` in commands.cpp - std::swap(global_size[0], global_size[2]); - std::swap(local_size[0], local_size[2]); + if (local_size.has_value()) { + std::swap(local_size.value()[0], local_size.value()[2]); + } std::swap(global_offset[0], global_offset[2]); // Verify global ID and local ID of each work item @@ -74,13 +66,15 @@ struct NDRangeUpdateTest EXPECT_EQ(global_id_y, y + global_offset[1]); EXPECT_EQ(global_id_z, z + global_offset[2]); - const int local_id_x = wi_ptr[3]; - const int local_id_y = wi_ptr[4]; - const int local_id_z = wi_ptr[5]; + if (local_size.has_value()) { + const int local_id_x = wi_ptr[3]; + const int local_id_y = wi_ptr[4]; + const int local_id_z = wi_ptr[5]; - EXPECT_EQ(local_id_x, x % local_size[0]); - EXPECT_EQ(local_id_y, y % local_size[1]); - EXPECT_EQ(local_id_z, z % local_size[2]); + EXPECT_EQ(local_id_x, x % local_size.value()[0]); + EXPECT_EQ(local_id_y, y % local_size.value()[1]); + EXPECT_EQ(local_id_z, z % local_size.value()[2]); + } } } } @@ -100,7 +94,7 @@ struct NDRangeUpdateTest } static constexpr size_t elements_per_id = 6; - static constexpr size_t n_dimensions = 3; + static constexpr uint32_t n_dimensions = 3; static constexpr std::array global_size = {8, 8, 8}; static constexpr std::array local_size = {1, 2, 2}; static constexpr std::array global_offset = {0, 4, 4}; @@ -113,10 +107,17 @@ struct NDRangeUpdateTest UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(NDRangeUpdateTest); -// Keep the kernel work dimensions as 3, and update local size and global -// offset. +// Add a 3 dimension kernel command to the command-buffer and update the +// local size and global offset TEST_P(NDRangeUpdateTest, Update3D) { - // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, nullptr, + &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update and verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -129,10 +130,11 @@ TEST_P(NDRangeUpdateTest, Update3D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -152,9 +154,17 @@ TEST_P(NDRangeUpdateTest, Update3D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Update the kernel work dimensions to use 1 in the Z dimension, -// and update global size, local size, and global offset to new values. +// Add a 3 dimension kernel command to the command-buffer. Update the kernel +// work dimensions to be 1 in the Z dimension, and update global size, local +// size, and global offset to new values. TEST_P(NDRangeUpdateTest, Update2D) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, nullptr, + &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -173,10 +183,11 @@ TEST_P(NDRangeUpdateTest, Update2D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -200,10 +211,18 @@ TEST_P(NDRangeUpdateTest, Update2D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Update the kernel work dimensions to be 1 in Y & Z dimensions, and check -// that the previously set global size, local size, and global offset update +// Add a 3 dimension kernel command to the command-buffer. Update the kernel +// work dimensions to be 1 in the Y & Z dimensions, and check that the +// previously set global size, local size, and global offset update // accordingly. TEST_P(NDRangeUpdateTest, Update1D) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, nullptr, + &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + // Run command-buffer prior to update an verify output ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); @@ -217,10 +236,11 @@ TEST_P(NDRangeUpdateTest, Update1D) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 0, // numNewPointerArgs 0, // numNewValueArgs - 3, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList nullptr, // pNewPointerArgList nullptr, // pNewValueArgList @@ -244,26 +264,108 @@ TEST_P(NDRangeUpdateTest, Update1D) { Validate(new_global_size, new_local_size, new_global_offset); } -// Test error code is returned if work dimension parameter changes -TEST_P(NDRangeUpdateTest, Invalid) { - const size_t new_work_dim = n_dimensions - 1; +// Test that setting `pNewLocalWorkSize` to a non-NULL value when the command +// was created with a NULL local work size works. +TEST_P(NDRangeUpdateTest, ImplToUserDefinedLocalSize) { + + // Append a kernel node without setting the local work-size. + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), nullptr, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update an verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Can't validate the local size because it is generated by the + // implementation. + Validate(global_size, std::nullopt, global_offset); + + // Set local size and global offset to update to + std::array new_local_size = {4, 2, 2}; + std::array new_global_offset = {3, 2, 1}; + std::array new_global_size = global_size; + + // Set a user-defined local work-size in the update desc. ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - new_work_dim, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + new_local_size.data(), // pNewLocalWorkSize }; - // Update command to command-buffer to use different work dim - ur_result_t result = - urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc); - ASSERT_EQ(UR_RESULT_ERROR_INVALID_OPERATION, result); + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that the user defined local work-size was set correctly. + Validate(new_global_size, new_local_size, new_global_offset); +} + +// Test that setting `pNewLocalWorkSize` to a NULL value when the command was +// created with a non-NULL local work size works. +TEST_P(NDRangeUpdateTest, UserToImplDefinedLocalSize) { + + // Append a kernel node and set a user defined local work-size. + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, global_offset.data(), + global_size.data(), local_size.data(), 0, nullptr, 0, nullptr, nullptr, + &command_handle)); + ASSERT_NE(command_handle, nullptr); + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + Validate(global_size, local_size, global_offset); + + // Set local size and global offset to update to + std::array new_global_offset = {3, 2, 1}; + std::array new_global_size = global_size; + + // Do not set a local-work size in the update desc to let the implementation + // decide which local-work size should be used. + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + kernel, // hNewKernel + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + new_global_offset.data(), // pNewGlobalWorkOffset + new_global_size.data(), // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that the kernel ran successfully and the global size is the + // expected. Cannot check the local size since it's implementation defined. + Validate(new_global_size, std::nullopt, new_global_offset); } diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp similarity index 93% rename from test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp index 2bf9755c21..bd2ca727e8 100644 --- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/usm_fill_kernel_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" #include #include @@ -38,7 +38,8 @@ struct USMFillCommandTest // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -71,7 +72,7 @@ struct USMFillCommandTest static constexpr size_t local_size = 4; static constexpr size_t global_size = 32; static constexpr size_t global_offset = 0; - static constexpr size_t n_dimensions = 1; + static constexpr uint32_t n_dimensions = 1; static constexpr size_t allocation_size = sizeof(val) * global_size; void *shared_ptr = nullptr; void *new_shared_ptr = nullptr; @@ -120,16 +121,17 @@ TEST_P(USMFillCommandTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext - 0, // numNewMemObjArgs - 1, // numNewPointerArgs - 1, // numNewValueArgs - static_cast(n_dimensions), // newWorkDim - nullptr, // pNewMemObjArgList - &new_output_desc, // pNewPointerArgList - &new_input_desc, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - &new_local_size, // pNewLocalWorkSize + kernel, // hNewKernel + 0, // numNewMemObjArgs + 1, // numNewPointerArgs + 1, // numNewValueArgs + n_dimensions, // newWorkDim + nullptr, // pNewMemObjArgList + &new_output_desc, // pNewPointerArgList + &new_input_desc, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + &new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize }; // Update kernel and enqueue command-buffer again @@ -173,10 +175,11 @@ TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 1, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList &new_output_desc, // pNewPointerArgList &new_input_desc, // pNewValueArgList @@ -234,7 +237,7 @@ struct USMMultipleFillCommandTest // Append kernel and store returned handle ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &elements, &local_size, 0, nullptr, nullptr, + &elements, &local_size, 0, nullptr, 0, nullptr, nullptr, &command_handles[k])); ASSERT_NE(command_handles[k], nullptr); } @@ -324,10 +327,11 @@ TEST_P(USMMultipleFillCommandTest, UpdateAllKernels) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 1, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList &new_output_desc, // pNewPointerArgList &new_input_desc, // pNewValueArgList diff --git a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp similarity index 96% rename from test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp rename to test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp index 7f9b5d7f69..e424af4b78 100644 --- a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/update/usm_saxpy_kernel_update.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "fixtures.h" +#include "../fixtures.h" #include #include @@ -82,7 +82,8 @@ struct USMSaxpyKernelTest : USMSaxpyKernelTestBase { // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, + &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); @@ -148,10 +149,11 @@ TEST_P(USMSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList @@ -182,7 +184,7 @@ struct USMMultiSaxpyKernelTest : USMSaxpyKernelTestBase { for (unsigned node = 0; node < nodes; node++) { ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, + &global_size, &local_size, 0, nullptr, 0, nullptr, nullptr, &command_handles[node])); ASSERT_NE(command_handles[node], nullptr); } @@ -253,10 +255,11 @@ TEST_P(USMMultiSaxpyKernelTest, UpdateParameters) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList @@ -318,10 +321,11 @@ TEST_P(USMMultiSaxpyKernelTest, UpdateWithoutBlocking) { ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext + kernel, // hNewKernel 0, // numNewMemObjArgs 2, // numNewPointerArgs 1, // numNewValueArgs - 0, // newWorkDim + n_dimensions, // newWorkDim nullptr, // pNewMemObjArgList new_input_descs, // pNewPointerArgList &new_A_desc, // pNewValueArgList diff --git a/test/conformance/testing/include/uur/raii.h b/test/conformance/testing/include/uur/raii.h index e4f456ec36..894a66dfdd 100644 --- a/test/conformance/testing/include/uur/raii.h +++ b/test/conformance/testing/include/uur/raii.h @@ -108,6 +108,12 @@ using Program = Wrapper; using Kernel = Wrapper; using Queue = Wrapper; using Event = Wrapper; +using CommandBuffer = + Wrapper; +using CommandBufferCommand = + Wrapper; } // namespace raii } // namespace uur diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 22f4ec6413..59f4a8e5b2 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -334,8 +334,8 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, printDeviceInfo(hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP); std::cout << prefix; - printDeviceInfo( - hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP); + printDeviceInfo( + hDevice, UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_CAPABILITIES_EXP); std::cout << prefix; printDeviceInfo(hDevice, UR_DEVICE_INFO_CLUSTER_LAUNCH_EXP); std::cout << prefix;