From 41a0841751263aeed5d8e3b646e410b1dcac66f0 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 3 Jan 2024 13:27:16 +0000 Subject: [PATCH 1/9] Merge pull request #1198 from al42and/aa-rocm6 [HIP] Fix build with ROCm 6.0.0 --- source/adapters/hip/common.hpp | 35 +++++++++++++++++++++++---------- source/adapters/hip/enqueue.cpp | 8 ++++---- source/adapters/hip/kernel.cpp | 2 +- source/adapters/hip/usm.cpp | 10 +++++++++- 4 files changed, 39 insertions(+), 16 deletions(-) diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp index 2649657f47..d7eea780a5 100644 --- a/source/adapters/hip/common.hpp +++ b/source/adapters/hip/common.hpp @@ -15,24 +15,39 @@ #include #include -// Hipify doesn't support cuArrayGetDescriptor, on AMD the hipArray can just be -// indexed, but on NVidia it is an opaque type and needs to go through -// cuArrayGetDescriptor so implement a utility function to get the array -// properties -inline void getArrayDesc(hipArray *Array, hipArray_Format &Format, - size_t &Channels) { +// Before ROCm 6, hipify doesn't support cuArrayGetDescriptor, on AMD the +// hipArray can just be indexed, but on NVidia it is an opaque type and needs to +// go through cuArrayGetDescriptor so implement a utility function to get the +// array properties +inline static hipError_t getArrayDesc(hipArray *Array, hipArray_Format &Format, + size_t &Channels) { +#if HIP_VERSION_MAJOR >= 6 + HIP_ARRAY_DESCRIPTOR ArrayDesc; + hipError_t err = hipArrayGetDescriptor(&ArrayDesc, Array); + if (err == hipSuccess) { + Format = ArrayDesc.Format; + Channels = ArrayDesc.NumChannels; + } + return err; +#else #if defined(__HIP_PLATFORM_AMD__) Format = Array->Format; Channels = Array->NumChannels; + return hipSuccess; #elif defined(__HIP_PLATFORM_NVIDIA__) CUDA_ARRAY_DESCRIPTOR ArrayDesc; - cuArrayGetDescriptor(&ArrayDesc, (CUarray)Array); - - Format = ArrayDesc.Format; - Channels = ArrayDesc.NumChannels; + CUresult err = cuArrayGetDescriptor(&ArrayDesc, (CUarray)Array); + if (err == CUDA_SUCCESS) { + Format = ArrayDesc.Format; + Channels = ArrayDesc.NumChannels; + return hipSuccess; + } else { + return hipErrorUnknown; // No easy way to map CUerror to hipError + } #else #error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); #endif +#endif } // HIP on NVIDIA headers guard hipArray3DCreate behind __CUDACC__, this does not diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index ebebcc27b5..109a248e16 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -898,7 +898,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( hipArray_Format Format; size_t NumChannels; - getArrayDesc(Array, Format, NumChannels); + UR_CHECK_ERROR(getArrayDesc(Array, Format, NumChannels)); int ElementByteSize = imageElementByteSize(Format); @@ -959,7 +959,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( hipArray_Format Format; size_t NumChannels; - getArrayDesc(Array, Format, NumChannels); + UR_CHECK_ERROR(getArrayDesc(Array, Format, NumChannels)); int ElementByteSize = imageElementByteSize(Format); @@ -1023,12 +1023,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( hipArray *SrcArray = std::get(hImageSrc->Mem).getArray(); hipArray_Format SrcFormat; size_t SrcNumChannels; - getArrayDesc(SrcArray, SrcFormat, SrcNumChannels); + UR_CHECK_ERROR(getArrayDesc(SrcArray, SrcFormat, SrcNumChannels)); hipArray *DstArray = std::get(hImageDst->Mem).getArray(); hipArray_Format DstFormat; size_t DstNumChannels; - getArrayDesc(DstArray, DstFormat, DstNumChannels); + UR_CHECK_ERROR(getArrayDesc(DstArray, DstFormat, DstNumChannels)); UR_ASSERT(SrcFormat == DstFormat, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index cc6f4384bc..bdd5f63fb2 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -279,7 +279,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj( auto array = std::get(hArgValue->Mem).getArray(); hipArray_Format Format; size_t NumChannels; - getArrayDesc(array, Format, NumChannels); + UR_CHECK_ERROR(getArrayDesc(array, Format, NumChannels)); if (Format != HIP_AD_FORMAT_UNSIGNED_INT32 && Format != HIP_AD_FORMAT_SIGNED_INT32 && Format != HIP_AD_FORMAT_HALF && Format != HIP_AD_FORMAT_FLOAT) { diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 7af7401f87..8854748da9 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -73,7 +73,11 @@ UR_APIEXPORT ur_result_t UR_APICALL USMFreeImpl(ur_context_handle_t hContext, ScopedContext Active(hContext->getDevice()); hipPointerAttribute_t hipPointerAttributeType; UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem)); - unsigned int Type = hipPointerAttributeType.memoryType; +#if HIP_VERSION >= 50600000 + const auto Type = hipPointerAttributeType.type; +#else + const auto Type = hipPointerAttributeType.memoryType; +#endif UR_ASSERT(Type == hipMemoryTypeDevice || Type == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_MEM_OBJECT); if (Type == hipMemoryTypeDevice) { @@ -171,7 +175,11 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(UR_USM_TYPE_SHARED); } UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem)); +#if HIP_VERSION >= 50600000 + Value = hipPointerAttributeType.type; +#else Value = hipPointerAttributeType.memoryType; +#endif UR_ASSERT(Value == hipMemoryTypeDevice || Value == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_MEM_OBJECT); if (Value == hipMemoryTypeDevice) { From 7b5f58bd7668d83e3dcf4bf2636d140b04024e31 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Thu, 4 Jan 2024 15:12:54 +0000 Subject: [PATCH 2/9] Merge pull request #1222 from sommerlukas/lukas/comgr-include-rocm4 [UR][HIP] Fix include for AMD COMGR --- source/adapters/hip/CMakeLists.txt | 19 +++++++++++++++++-- source/adapters/hip/common.hpp | 4 ++++ source/adapters/hip/program.cpp | 4 ++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/source/adapters/hip/CMakeLists.txt b/source/adapters/hip/CMakeLists.txt index 7de1b5f501..84893caa55 100644 --- a/source/adapters/hip/CMakeLists.txt +++ b/source/adapters/hip/CMakeLists.txt @@ -98,6 +98,21 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") ) if(UR_ENABLE_COMGR) + set(UR_COMGR_VERSION5_HEADER "${UR_HIP_INCLUDE_DIR}/amd_comgr/amd_comgr.h") + set(UR_COMGR_VERSION4_HEADER "${UR_HIP_INCLUDE_DIR}/amd_comgr.h") + # The COMGR header changed location between ROCm versions 4 and 5. + # Check for existence in the version 5 location or fallback to version 4 + if(NOT EXISTS "${UR_COMGR_VERSION5_HEADER}") + if(NOT EXISTS "${UR_COMGR_VERSION4_HEADER}") + message(FATAL_ERROR "Could not find AMD COMGR header at " + "${UR_COMGR_VERSION5_HEADER} or" + "${UR_COMGR_VERSION4_HEADER}, " + "check ROCm installation") + else() + target_compile_definitions(${TARGET_NAME} PRIVATE UR_COMGR_VERSION4_INCLUDE) + endif() + endif() + add_library(amd_comgr SHARED IMPORTED GLOBAL) set_target_properties( amd_comgr PROPERTIES @@ -105,8 +120,8 @@ if("${UR_HIP_PLATFORM}" STREQUAL "AMD") INTERFACE_INCLUDE_DIRECTORIES "${HIP_HEADERS}" INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${HIP_HEADERS}" ) - target_link_libraries(pi_hip PUBLIC amd_comgr) - target_compile_definitions(pi_hip PRIVATE SYCL_ENABLE_KERNEL_FUSION) + target_link_libraries(${TARGET_NAME} PUBLIC amd_comgr) + target_compile_definitions(${TARGET_NAME} PRIVATE SYCL_ENABLE_KERNEL_FUSION) endif(UR_ENABLE_COMGR) target_link_libraries(${TARGET_NAME} PRIVATE diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp index d7eea780a5..be332c280b 100644 --- a/source/adapters/hip/common.hpp +++ b/source/adapters/hip/common.hpp @@ -10,8 +10,12 @@ #pragma once #ifdef SYCL_ENABLE_KERNEL_FUSION +#ifdef UR_COMGR_VERSION4_INCLUDE +#include +#else #include #endif +#endif #include #include diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index fa38384e62..f5757142ca 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -11,7 +11,11 @@ #include "program.hpp" #ifdef SYCL_ENABLE_KERNEL_FUSION +#ifdef UR_COMGR_VERSION4_INCLUDE +#include +#else #include +#endif namespace { template struct COMgrObjCleanUp { From c9d48102eff4500910dbcb37d62157c55ef72d54 Mon Sep 17 00:00:00 2001 From: Weronika Lewandowska Date: Wed, 20 Dec 2023 11:17:05 +0100 Subject: [PATCH 3/9] Merge pull request #1203 from pbalcer/random-coverity-issues [cuda][null][common] fix a few coverity issues --- source/adapters/cuda/device.cpp | 5 ++++- source/adapters/cuda/event.cpp | 6 +++++- source/adapters/cuda/sampler.cpp | 1 - source/adapters/null/ur_null.cpp | 4 ++-- source/common/ur_util.hpp | 2 ++ 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index 8d95ad05e8..4875adfed1 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -16,6 +16,7 @@ #include "context.hpp" #include "device.hpp" #include "platform.hpp" +#include "ur_util.hpp" int getAttribute(ur_device_handle_t device, CUdevice_attribute attribute) { int value; @@ -40,7 +41,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, ur_device_info_t propName, size_t propSize, void *pPropValue, - size_t *pPropSizeRet) { + size_t *pPropSizeRet) try { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); static constexpr uint32_t MaxWorkItemDimensions = 3u; @@ -1033,6 +1034,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, break; } return UR_RESULT_ERROR_INVALID_ENUMERATION; +} catch (...) { + return exceptionToResult(std::current_exception()); } /// \return PI_SUCCESS if the function is executed successfully diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 2cbfcbc39b..804b35a9b7 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -12,6 +12,8 @@ #include "context.hpp" #include "device.hpp" #include "queue.hpp" +#include "ur_api.h" +#include "ur_util.hpp" #include #include @@ -65,7 +67,7 @@ ur_result_t ur_event_handle_t_::start() { return Result; } -bool ur_event_handle_t_::isCompleted() const noexcept { +bool ur_event_handle_t_::isCompleted() const noexcept try { if (!IsRecorded) { return false; } @@ -80,6 +82,8 @@ bool ur_event_handle_t_::isCompleted() const noexcept { } } return true; +} catch (...) { + return exceptionToResult(std::current_exception()) == UR_RESULT_SUCCESS; } uint64_t ur_event_handle_t_::getQueuedTime() const { diff --git a/source/adapters/cuda/sampler.cpp b/source/adapters/cuda/sampler.cpp index 0e1305da23..ce4283edd3 100644 --- a/source/adapters/cuda/sampler.cpp +++ b/source/adapters/cuda/sampler.cpp @@ -71,7 +71,6 @@ urSamplerGetInfo(ur_sampler_handle_t hSampler, ur_sampler_info_t propName, default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } - return {}; } UR_APIEXPORT ur_result_t UR_APICALL diff --git a/source/adapters/null/ur_null.cpp b/source/adapters/null/ur_null.cpp index 5a62761b67..a64d46e4d4 100644 --- a/source/adapters/null/ur_null.cpp +++ b/source/adapters/null/ur_null.cpp @@ -173,7 +173,7 @@ context_t::context_t() { return UR_RESULT_ERROR_UNSUPPORTED_SIZE; } *ppMem = malloc(size); - if (ppMem == nullptr) { + if (*ppMem == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } return UR_RESULT_SUCCESS; @@ -189,7 +189,7 @@ context_t::context_t() { return UR_RESULT_ERROR_UNSUPPORTED_SIZE; } *ppMem = malloc(size); - if (ppMem == nullptr) { + if (*ppMem == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } return UR_RESULT_SUCCESS; diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index a73f348b52..00aaf8eee2 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -288,6 +288,8 @@ inline ur_result_t exceptionToResult(std::exception_ptr eptr) { return UR_RESULT_SUCCESS; } catch (std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (const ur_result_t &e) { + return e; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } From 24e9815ee254c1c0cbe819226d8811caa77777dc Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 16 Jan 2024 10:38:22 +0000 Subject: [PATCH 4/9] Merge pull request #1206 from ProGTX/peter/werror Werror fixes --- CMakeLists.txt | 3 -- cmake/helpers.cmake | 8 ++++- source/adapters/cuda/device.cpp | 3 +- source/adapters/cuda/image.cpp | 4 +-- source/adapters/cuda/program.cpp | 3 +- source/adapters/cuda/sampler.cpp | 2 +- source/adapters/hip/enqueue.cpp | 21 ++---------- source/adapters/hip/memory.cpp | 32 +++---------------- source/adapters/level_zero/CMakeLists.txt | 2 +- source/adapters/native_cpu/context.cpp | 8 ++--- .../cuda/urDeviceCreateWithNativeHandle.cpp | 4 +-- 11 files changed, 28 insertions(+), 62 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 54aef2f7ef..631699a9c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,9 +111,6 @@ if(UR_ENABLE_TRACING) ) if (MSVC) set(TARGET_XPTI $,xpti,xptid>) - - # disable warning C4267: The compiler detected a conversion from size_t to a smaller type. - target_compile_options(xptifw PRIVATE /wd4267) else() set(TARGET_XPTI xpti) endif() diff --git a/cmake/helpers.cmake b/cmake/helpers.cmake index 3c90d41236..05878d870f 100644 --- a/cmake/helpers.cmake +++ b/cmake/helpers.cmake @@ -82,10 +82,16 @@ function(add_ur_target_compile_options name) /W3 /MD$<$:d> /GS + /DWIN32_LEAN_AND_MEAN + /DNOMINMAX ) if(UR_DEVELOPER_MODE) - target_compile_options(${name} PRIVATE /WX /GS) + # _CRT_SECURE_NO_WARNINGS used mainly because of getenv + # C4267: The compiler detected a conversion from size_t to a smaller type. + target_compile_options(${name} PRIVATE + /WX /GS /D_CRT_SECURE_NO_WARNINGS /wd4267 + ) endif() endif() endfunction() diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index 4875adfed1..ee766cfc1c 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -1100,7 +1100,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( ur_device_handle_t hDevice, ur_native_handle_t *phNativeHandle) { - *phNativeHandle = reinterpret_cast(hDevice->get()); + *phNativeHandle = reinterpret_cast( + static_cast(hDevice->get())); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 1f336dd2d7..3168c008a3 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -146,7 +146,7 @@ urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, std::make_pair(image_channel_type, num_channels)); cuda_format = cuda_format_and_size.first; pixel_size_bytes = cuda_format_and_size.second; - } catch (std::out_of_range &e) { + } catch (const std::out_of_range &) { return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; } } @@ -276,7 +276,7 @@ ur_result_t urTextureCreate(ur_sampler_handle_t hSampler, ImageTexDesc.mipmapFilterMode = MipFilterMode; ImageTexDesc.maxMipmapLevelClamp = hSampler->MaxMipmapLevelClamp; ImageTexDesc.minMipmapLevelClamp = hSampler->MinMipmapLevelClamp; - ImageTexDesc.maxAnisotropy = hSampler->MaxAnisotropy; + ImageTexDesc.maxAnisotropy = static_cast(hSampler->MaxAnisotropy); // The address modes can interfere with other dimensionsenqueueEventsWait // e.g. 1D texture sampling can be interfered with when setting other diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 9b7959eb85..022fd258f7 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -141,7 +141,8 @@ ur_result_t ur_program_handle_t_::buildProgram(const char *BuildOptions) { getMaxRegistersJitOptionValue(this->BuildOptions, MaxRegs); if (Valid) { Options.push_back(CU_JIT_MAX_REGISTERS); - OptionVals.push_back(reinterpret_cast(MaxRegs)); + OptionVals.push_back( + reinterpret_cast(static_cast(MaxRegs))); } } diff --git a/source/adapters/cuda/sampler.cpp b/source/adapters/cuda/sampler.cpp index ce4283edd3..5ebccf516b 100644 --- a/source/adapters/cuda/sampler.cpp +++ b/source/adapters/cuda/sampler.cpp @@ -18,7 +18,7 @@ urSamplerCreate(ur_context_handle_t hContext, const ur_sampler_desc_t *pDesc, new ur_sampler_handle_t_(hContext)}; if (pDesc->stype == UR_STRUCTURE_TYPE_SAMPLER_DESC) { - Sampler->Props |= pDesc->normalizedCoords; + Sampler->Props |= static_cast(pDesc->normalizedCoords); Sampler->Props |= pDesc->filterMode << 1; Sampler->Props |= pDesc->addressingMode << 2; } else { diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 109a248e16..f17c8da5ce 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -15,26 +15,9 @@ #include "memory.hpp" #include "queue.hpp" -namespace { +extern size_t imageElementByteSize(hipArray_Format ArrayFormat); -static size_t imageElementByteSize(hipArray_Format ArrayFormat) { - switch (ArrayFormat) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - case HIP_AD_FORMAT_SIGNED_INT8: - return 1; - case HIP_AD_FORMAT_UNSIGNED_INT16: - case HIP_AD_FORMAT_SIGNED_INT16: - case HIP_AD_FORMAT_HALF: - return 2; - case HIP_AD_FORMAT_UNSIGNED_INT32: - case HIP_AD_FORMAT_SIGNED_INT32: - case HIP_AD_FORMAT_FLOAT: - return 4; - default: - detail::ur::die("Invalid image format."); - } - return 0; -} +namespace { ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, hipStream_t Stream, uint32_t NumEventsInWaitList, diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index 899dad5674..827980be5e 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -13,10 +13,8 @@ #include #include -namespace { - -size_t GetHipFormatPixelSize(hipArray_Format Format) { - switch (Format) { +size_t imageElementByteSize(hipArray_Format ArrayFormat) { + switch (ArrayFormat) { case HIP_AD_FORMAT_UNSIGNED_INT8: case HIP_AD_FORMAT_SIGNED_INT8: return 1; @@ -31,10 +29,9 @@ size_t GetHipFormatPixelSize(hipArray_Format Format) { default: detail::ur::die("Invalid HIP format specifier"); } + return 0; } -} // namespace - /// Decreases the reference count of the Mem object. /// If this is zero, calls the relevant HIP Free function /// \return UR_RESULT_SUCCESS unless deallocation error @@ -280,7 +277,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_CHECK_ERROR( hipArray3DGetDescriptor(&ArrayDescriptor, Mem.getArray())); const auto PixelSizeBytes = - GetHipFormatPixelSize(ArrayDescriptor.Format) * + imageElementByteSize(ArrayDescriptor.Format) * ArrayDescriptor.NumChannels; const auto ImageSizeBytes = PixelSizeBytes * @@ -573,25 +570,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, } }; - const auto hipFormatToElementSize = - [](hipArray_Format HipFormat) -> size_t { - switch (HipFormat) { - case HIP_AD_FORMAT_UNSIGNED_INT8: - case HIP_AD_FORMAT_SIGNED_INT8: - return 1; - case HIP_AD_FORMAT_UNSIGNED_INT16: - case HIP_AD_FORMAT_SIGNED_INT16: - case HIP_AD_FORMAT_HALF: - return 2; - case HIP_AD_FORMAT_UNSIGNED_INT32: - case HIP_AD_FORMAT_SIGNED_INT32: - case HIP_AD_FORMAT_FLOAT: - return 4; - default: - detail::ur::die("Invalid Hip format specified."); - } - }; - switch (propName) { case UR_IMAGE_INFO_FORMAT: return ReturnValue(ur_image_format_t{UR_IMAGE_CHANNEL_ORDER_RGBA, @@ -603,7 +581,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, case UR_IMAGE_INFO_DEPTH: return ReturnValue(ArrayInfo.Depth); case UR_IMAGE_INFO_ELEMENT_SIZE: - return ReturnValue(hipFormatToElementSize(ArrayInfo.Format)); + return ReturnValue(imageElementByteSize(ArrayInfo.Format)); case UR_IMAGE_INFO_ROW_PITCH: case UR_IMAGE_INFO_SLICE_PITCH: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index 7b24223b95..0d2f967cdc 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -121,7 +121,7 @@ add_ur_adapter(${TARGET_NAME} # TODO: fix level_zero adapter conversion warnings target_compile_options(${TARGET_NAME} PRIVATE - $<$:/wd4267 /wd4805 /wd4244 /D_CRT_SECURE_NO_WARNINGS> + $<$:/wd4805 /wd4244> ) set_target_properties(${TARGET_NAME} PROPERTIES diff --git a/source/adapters/native_cpu/context.cpp b/source/adapters/native_cpu/context.cpp index 962525d1fc..c485725828 100644 --- a/source/adapters/native_cpu/context.cpp +++ b/source/adapters/native_cpu/context.cpp @@ -17,10 +17,10 @@ #include "common.hpp" #include "context.hpp" -UR_APIEXPORT ur_result_t UR_APICALL -urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, - const ur_context_properties_t *pProperties, - ur_context_handle_t *phContext) { +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + [[maybe_unused]] uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *pProperties, + ur_context_handle_t *phContext) { std::ignore = pProperties; assert(DeviceCount == 1); diff --git a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp index 3b8ebc416b..dca7932606 100644 --- a/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp +++ b/test/adapters/cuda/urDeviceCreateWithNativeHandle.cpp @@ -15,8 +15,8 @@ TEST_F(urCudaDeviceCreateWithNativeHandle, Success) { CUdevice cudaDevice; ASSERT_SUCCESS_CUDA(cuDeviceGet(&cudaDevice, 0)); - ur_native_handle_t nativeCuda = - reinterpret_cast(cudaDevice); + ur_native_handle_t nativeCuda = reinterpret_cast( + static_cast(cudaDevice)); ur_device_handle_t urDevice; ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(nativeCuda, platform, nullptr, &urDevice)); From 34958671c5a63ad5eb6d4f550e1f38f32b40b5bb Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 17 Jan 2024 12:35:19 +0000 Subject: [PATCH 5/9] Merge pull request #1245 from nrspruit/enable_relaxed_alloc [L0] Only Override max allocation limits given env --- source/adapters/level_zero/device.cpp | 26 +++++++++++--------------- source/adapters/level_zero/device.hpp | 2 +- source/adapters/level_zero/program.cpp | 4 ++-- source/adapters/level_zero/usm.cpp | 4 ++-- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 05b66e12f4..c132e28738 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -10,6 +10,7 @@ #include "device.hpp" #include "ur_level_zero.hpp" +#include "ur_util.hpp" #include #include #include @@ -268,9 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - // if not optimized for 32-bit access, return total memory size. - // otherwise, return only maximum allocatable size. - if (Device->useOptimized32bitAccess() == 0) { + // if the user wishes to allocate large allocations on a system that usually + // does not allow that allocation size, then we return the max global mem + // size as the limit. + if (Device->useRelaxedAllocationLimits()) { return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); } else { return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); @@ -920,20 +922,14 @@ ur_device_handle_t_::useImmediateCommandLists() { } } -int32_t ur_device_handle_t_::useOptimized32bitAccess() { - static const int32_t Optimize32bitAccessMode = [this] { - // If device is Intel(R) Data Center GPU Max, - // use default provided by L0 driver. - // TODO: Use IP versioning to select based on range of devices - if (this->isPVC()) - return -1; - const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); - if (!UrRet) - return 0; - return std::atoi(UrRet); +bool ur_device_handle_t_::useRelaxedAllocationLimits() { + static const bool EnableRelaxedAllocationLimits = [] { + auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); + const bool RetVal = UrRet ? std::stoi(*UrRet) : 0; + return RetVal; }(); - return Optimize32bitAccessMode; + return EnableRelaxedAllocationLimits; } ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 3b91b70058..94480336c5 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -160,7 +160,7 @@ struct ur_device_handle_t_ : _ur_object { // provide support for only one, like for Intel(R) // Data Center GPU Max, for which L0 driver only // supports stateless. - int32_t useOptimized32bitAccess(); + bool useRelaxedAllocationLimits(); bool isSubDevice() { return RootDevice != nullptr; } diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index f118a5b9dd..bb2d964422 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -161,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeBuildOptions += pOptions; } - if (phDevices[0]->useOptimized32bitAccess() == 0) { + if (phDevices[0]->useRelaxedAllocationLimits()) { ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; } @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // ze-opt-greater-than-4GB-buffer-required to disable // stateful optimizations and be able to use larger than // 4GB allocations on these kernels. - if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + if (Context->Devices[0]->useRelaxedAllocationLimits()) { Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; } } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index d2dfc9b37d..134316da86 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,11 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - if (Device->useOptimized32bitAccess() == 0 && + ZeStruct RelaxedDesc; + if (Device->useRelaxedAllocationLimits() && (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { // Tell Level-Zero to accept Size > maxMemAllocSize if // large allocations are used. - ZeStruct RelaxedDesc; RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; ZeDesc.pNext = &RelaxedDesc; } From 9e5bc9d5b1f0a62c0c045173acc4d5cce7e315d1 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Fri, 19 Jan 2024 15:52:55 +0000 Subject: [PATCH 6/9] Merge pull request #1260 from kbenzie/benie/dont-include-windows.h-in-ur_info.hpp [Common] Don't include Windows.h in ur_util.hpp --- source/common/CMakeLists.txt | 37 ++++++++-------- source/common/ur_lib_loader.hpp | 6 ++- source/common/ur_util.cpp | 44 +++++++++++++++++++ source/common/ur_util.hpp | 34 +------------- .../layers/validation/backtrace_win.cpp | 3 +- 5 files changed, 71 insertions(+), 53 deletions(-) create mode 100644 source/common/ur_util.cpp diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index f240f9908b..5b98413676 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -3,28 +3,27 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -add_library(ur_common INTERFACE) +add_subdirectory(unified_malloc_framework) +add_subdirectory(umf_pools) + +add_ur_library(ur_common STATIC + umf_helpers.hpp + ur_pool_manager.hpp + ur_util.cpp + ur_util.hpp + $<$:windows/ur_lib_loader.cpp> + $<$:linux/ur_lib_loader.cpp> +) add_library(${PROJECT_NAME}::common ALIAS ur_common) -target_include_directories(ur_common INTERFACE +target_include_directories(ur_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include ) -add_subdirectory(unified_malloc_framework) -add_subdirectory(umf_pools) -target_link_libraries(ur_common INTERFACE unified_malloc_framework disjoint_pool ${CMAKE_DL_LIBS} ${PROJECT_NAME}::headers) - -if(WIN32) - target_sources(ur_common - INTERFACE - ${CMAKE_CURRENT_SOURCE_DIR}/windows/ur_lib_loader.cpp - umf_helpers.hpp ur_pool_manager.hpp - ) -else() - target_sources(ur_common - INTERFACE - ${CMAKE_CURRENT_SOURCE_DIR}/linux/ur_lib_loader.cpp - umf_helpers.hpp ur_pool_manager.hpp - ) -endif() +target_link_libraries(ur_common PUBLIC + unified_malloc_framework + disjoint_pool + ${CMAKE_DL_LIBS} + ${PROJECT_NAME}::headers +) diff --git a/source/common/ur_lib_loader.hpp b/source/common/ur_lib_loader.hpp index cd917e12cf..c26c9979d8 100644 --- a/source/common/ur_lib_loader.hpp +++ b/source/common/ur_lib_loader.hpp @@ -12,7 +12,11 @@ #include -#include "ur_util.hpp" +#if _WIN32 +#include +#else +#define HMODULE void * +#endif namespace ur_loader { diff --git a/source/common/ur_util.cpp b/source/common/ur_util.cpp new file mode 100644 index 0000000000..e486ff6e1a --- /dev/null +++ b/source/common/ur_util.cpp @@ -0,0 +1,44 @@ +/* + * + * Copyright (C) 2022-2023 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + */ + +#include "ur_util.hpp" + +#ifdef _WIN32 +#include +int ur_getpid(void) { return static_cast(GetCurrentProcessId()); } +#else + +#include +int ur_getpid(void) { return static_cast(getpid()); } +#endif + +std::optional ur_getenv(const char *name) { +#if defined(_WIN32) + constexpr int buffer_size = 1024; + char buffer[buffer_size]; + auto rc = GetEnvironmentVariableA(name, buffer, buffer_size); + if (0 != rc && rc < buffer_size) { + return std::string(buffer); + } else if (rc >= buffer_size) { + std::stringstream ex_ss; + ex_ss << "Environment variable " << name << " value too long!" + << " Maximum length is " << buffer_size - 1 << " characters."; + throw std::invalid_argument(ex_ss.str()); + } + return std::nullopt; +#else + const char *tmp_env = getenv(name); + if (tmp_env != nullptr) { + return std::string(tmp_env); + } else { + return std::nullopt; + } +#endif +} diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 00aaf8eee2..bda8e835d7 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -21,14 +21,7 @@ #include #include -#ifdef _WIN32 -#include -inline int ur_getpid(void) { return static_cast(GetCurrentProcessId()); } -#else - -#include -inline int ur_getpid(void) { return static_cast(getpid()); } -#endif +int ur_getpid(void); /* for compatibility with non-clang compilers */ #if defined(__has_feature) @@ -62,7 +55,6 @@ inline int ur_getpid(void) { return static_cast(getpid()); } #include #define MAKE_LIBRARY_NAME(NAME, VERSION) NAME ".dll" #else -#define HMODULE void * #if defined(__APPLE__) #define MAKE_LIBRARY_NAME(NAME, VERSION) "lib" NAME "." VERSION ".dylib" #else @@ -94,29 +86,7 @@ inline std::string create_library_path(const char *name, const char *path) { #endif /////////////////////////////////////////////////////////////////////////////// -inline std::optional ur_getenv(const char *name) { -#if defined(_WIN32) - constexpr int buffer_size = 1024; - char buffer[buffer_size]; - auto rc = GetEnvironmentVariableA(name, buffer, buffer_size); - if (0 != rc && rc < buffer_size) { - return std::string(buffer); - } else if (rc >= buffer_size) { - std::stringstream ex_ss; - ex_ss << "Environment variable " << name << " value too long!" - << " Maximum length is " << buffer_size - 1 << " characters."; - throw std::invalid_argument(ex_ss.str()); - } - return std::nullopt; -#else - const char *tmp_env = getenv(name); - if (tmp_env != nullptr) { - return std::string(tmp_env); - } else { - return std::nullopt; - } -#endif -} +std::optional ur_getenv(const char *name); inline bool getenv_tobool(const char *name) { auto env = ur_getenv(name); diff --git a/source/loader/layers/validation/backtrace_win.cpp b/source/loader/layers/validation/backtrace_win.cpp index 5d63b0be6a..62a023a4ec 100644 --- a/source/loader/layers/validation/backtrace_win.cpp +++ b/source/loader/layers/validation/backtrace_win.cpp @@ -9,8 +9,9 @@ */ #include "backtrace.hpp" -#include #include +// Windows.h must be included before DbgHelp.h +#include #include namespace ur_validation_layer { From eacde9b6c45e5aa05b658cce5e83f8b2db64cf0d Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Fri, 19 Jan 2024 15:59:50 +0000 Subject: [PATCH 7/9] Merge pull request #1257 from pbalcer/fix-native-handle-waits [L0] fix waiting on non-owned native handle events --- source/adapters/level_zero/event.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index d8af1e674d..a7f6df0804 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -559,13 +559,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( ///< events to wait for completion ) { for (uint32_t I = 0; I < NumEvents; I++) { - if (EventWaitList[I]->UrQueue->ZeEventsScope == OnDemandHostVisibleProxy) { + auto e = EventWaitList[I]; + if (e->UrQueue && e->UrQueue->ZeEventsScope == OnDemandHostVisibleProxy) { // Make sure to add all host-visible "proxy" event signals if needed. // This ensures that all signalling commands are submitted below and // thus proxy events can be waited without a deadlock. // - ur_event_handle_t_ *Event = - ur_cast(EventWaitList[I]); + ur_event_handle_t_ *Event = ur_cast(e); if (!Event->hasExternalRefs()) die("urEventsWait must not be called for an internal event"); @@ -716,6 +716,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( Context, UR_EXT_COMMAND_TYPE_USER, Properties->isNativeHandleOwned); + UREvent->RefCountExternal++; + } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { From 9f79920ecdb2c2c077a96d77625d8441d0381091 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 22 Jan 2024 12:30:12 +0000 Subject: [PATCH 8/9] Merge pull request #1259 from igchor/fix_sync [L0] do not ignore returned values from zeHostSynchronize --- source/adapters/level_zero/event.cpp | 2 +- source/adapters/level_zero/queue.cpp | 8 ++++---- source/adapters/level_zero/queue.hpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index a7f6df0804..3cfac2cb45 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -107,7 +107,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( /* IsInternal */ false)); } - Queue->synchronize(); + UR_CALL(Queue->synchronize()); if (OutEvent) { Queue->LastCommandEvent = reinterpret_cast(*OutEvent); diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index f07e0df675..8acee473e7 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -438,7 +438,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease( return Res; // Make sure all commands get executed. - Queue->synchronize(); + UR_CALL(Queue->synchronize()); // Destroy all the fences created associated with this queue. for (auto it = Queue->CommandListMap.begin(); @@ -654,7 +654,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish( // Lock automatically releases when this goes out of scope. std::scoped_lock Lock(UrQueue->Mutex); - UrQueue->synchronize(); + UR_CALL(UrQueue->synchronize()); } else { std::unique_lock Lock(UrQueue->Mutex); std::vector ZeQueues; @@ -1241,7 +1241,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // Check global control to make every command blocking for debugging. if (IsBlocking || (UrL0Serialize & UrL0SerializeBlock) != 0) { if (UsingImmCmdLists) { - synchronize(); + UR_CALL(synchronize()); } else { // Wait until command lists attached to the command queue are executed. ZE2UR_CALL(zeHostSynchronize, (ZeCommandQueue)); @@ -1445,7 +1445,7 @@ ur_result_t ur_queue_handle_t_::synchronize() { for (auto &QueueGroup : QueueMap) { if (UsingImmCmdLists) { for (auto &ImmCmdList : QueueGroup.second.ImmCmdLists) - syncImmCmdList(this, ImmCmdList); + UR_CALL(syncImmCmdList(this, ImmCmdList)); } else { for (auto &ZeQueue : QueueGroup.second.ZeQueues) if (ZeQueue) diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 88281925ce..8022c45e0e 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -408,7 +408,7 @@ struct ur_queue_handle_t_ : _ur_object { bool isImmediateSubmission() const; // Wait for all commandlists associated with this Queue to finish operations. - ur_result_t synchronize(); + [[nodiscard]] ur_result_t synchronize(); // Get event from the queue's cache. // Returns nullptr if the cache doesn't contain any reusable events or if the From 2d27a62757ba2f2f4a47ed0a0e86254081634419 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Mon, 22 Jan 2024 12:31:47 +0000 Subject: [PATCH 9/9] Set version to v0.8.3 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 631699a9c5..fda388c7b9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) -project(unified-runtime VERSION 0.8.2) +project(unified-runtime VERSION 0.8.3) include(GNUInstallDirs) include(CheckCXXSourceCompiles)