Skip to content

Commit

Permalink
Merge branch 'main' into remove_opencl_assertions
Browse files Browse the repository at this point in the history
  • Loading branch information
lbushi25 authored Jun 27, 2024
2 parents 69d536a + 187c2fa commit 4ee46be
Show file tree
Hide file tree
Showing 28 changed files with 877 additions and 195 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/e2e_core.yml
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,11 @@ jobs:
echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV
echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV
# TODO: remove once intel/llvm lit tests can properly recognize the GPU
- name: Configure hardware platform feature for L0
if: matrix.adapter.name == 'L0'
run: sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py

- name: Run e2e tests
id: tests
run: ninja -C build-e2e check-sycl-e2e
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/e2e_level_zero.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ jobs:
config: ""
unit: "gpu"
# Failing tests
xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp"
xfail: "ESIMD/preemption.cpp;Matrix/SG32/element_wise_all_ops.cpp;Matrix/SG32/get_coord_int8_matB.cpp;Matrix/element_wise_all_ops.cpp;Matrix/element_wise_all_ops_1d.cpp;Matrix/element_wise_all_ops_1d_cont.cpp;Matrix/element_wise_all_ops_scalar.cpp;Matrix/element_wise_ops.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_apply_bf16.cpp;Matrix/joint_matrix_apply_two_matrices.cpp;Matrix/joint_matrix_bfloat16.cpp;Matrix/joint_matrix_bfloat16_array.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;ProgramManager/uneven_kernel_split.cpp"
# Flaky tests
filter_out: "GroupAlgorithm/root_group.cpp|Basic/exceptions-SYCL-2020.cpp|Graph/UnsupportedDevice/device_query.cpp|Graph/RecordReplay/exception_inconsistent_contexts.cpp"
# These runners by default spawn upwards of 260 workers. That's too much for the GPU.
filter_out: "UserDefinedReductions/user_defined_reductions.cpp"
# These runners by default spawn upwards of 260 workers.
# We also add a time out just in case some test hangs
extra_lit_flags: "-sv -j 50 --max-time 600"
extra_lit_flags: "-sv -j 100 --max-time 600"
33 changes: 19 additions & 14 deletions source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -773,9 +773,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
}
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
if (pImageDesc->rowPitch == 0) {
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -788,21 +788,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
}
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = copyExtent.height;
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
cpy_desc.dstHeight = hostExtent.height;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = copyExtent.height;
cpy_desc.Depth = copyExtent.depth;
Expand All @@ -811,16 +814,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
cpy_desc.dstHost = pDst;
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
cpy_desc.dstHeight = hostExtent.height;
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
cpy_desc.Height = std::max(uint64_t{1}, copyExtent.height);
cpy_desc.Depth = pImageDesc->arraySize;
Expand All @@ -834,9 +839,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
// the end
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = 0;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = 0;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
Expand All @@ -847,9 +852,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
cpy_desc.srcArray = (CUarray)pSrc;
Expand All @@ -860,10 +865,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand All @@ -878,10 +883,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
CUDA_MEMCPY3D cpy_desc = {};
cpy_desc.srcXInBytes = srcOffset.x;
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
cpy_desc.srcY = srcOffset.y;
cpy_desc.srcZ = srcOffset.z;
cpy_desc.dstXInBytes = dstOffset.x;
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
cpy_desc.dstY = dstOffset.y;
cpy_desc.dstZ = dstOffset.z;
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
Expand Down
3 changes: 3 additions & 0 deletions source/adapters/level_zero/adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
for (uint32_t I = 0; I < ZeDriverCount; ++I) {
auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
UR_CALL(platform->initialize());
ZE2UR_CALL(zelLoaderTranslateHandle,
(ZEL_HANDLE_DRIVER, platform->ZeDriver,
(void **)&platform->ZeDriverHandleExpTranslated));

// Save a copy in the cache for future uses.
platforms.push_back(std::move(platform));
Expand Down
1 change: 1 addition & 0 deletions source/adapters/level_zero/adapter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include "logger/ur_logger.hpp"
#include <atomic>
#include <loader/ze_loader.h>
#include <mutex>
#include <optional>
#include <ur/ur.hpp>
Expand Down
22 changes: 15 additions & 7 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,14 @@ bool PreferCopyEngineForFill = [] {
ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
ur_context_handle_t Context, ur_device_handle_t Device,
ze_command_list_handle_t CommandList,
ze_command_list_handle_t CommandListTranslated,
ze_command_list_handle_t CommandListResetEvents,
ze_command_list_handle_t CopyCommandList,
ZeStruct<ze_command_list_desc_t> ZeDesc,
ZeStruct<ze_command_list_desc_t> ZeCopyDesc,
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
: Context(Context), Device(Device), ZeComputeCommandList(CommandList),
ZeComputeCommandListTranslated(CommandListTranslated),
ZeCommandListResetEvents(CommandListResetEvents),
ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList),
ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr),
Expand Down Expand Up @@ -605,11 +607,16 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
&ZeCopyCommandList));
}

ze_command_list_handle_t ZeComputeCommandListTranslated = nullptr;
ZE2UR_CALL(zelLoaderTranslateHandle,
(ZEL_HANDLE_COMMAND_LIST, ZeComputeCommandList,
(void **)&ZeComputeCommandListTranslated));

try {
*CommandBuffer = new ur_exp_command_buffer_handle_t_(
Context, Device, ZeComputeCommandList, ZeCommandListResetEvents,
ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc,
CommandBufferDesc, IsInOrder);
Context, Device, ZeComputeCommandList, ZeComputeCommandListTranslated,
ZeCommandListResetEvents, ZeCopyCommandList, ZeCommandListDesc,
ZeCopyCommandListDesc, CommandBufferDesc, IsInOrder);
} catch (const std::bad_alloc &) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (...) {
Expand Down Expand Up @@ -791,8 +798,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
(CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc,
&CommandId));
(CommandBuffer->ZeComputeCommandListTranslated,
&ZeMutableCommandDesc, &CommandId));
DEBUG_LOG(CommandId);
}
try {
Expand Down Expand Up @@ -1619,8 +1626,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
auto Plt = CommandBuffer->Context->getPlatform();
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
(CommandBuffer->ZeComputeCommandList, &MutableCommandDesc));
ZE2UR_CALL(
Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
(CommandBuffer->ZeComputeCommandListTranslated, &MutableCommandDesc));
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList));

return UR_RESULT_SUCCESS;
Expand Down
4 changes: 4 additions & 0 deletions source/adapters/level_zero/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
ur_exp_command_buffer_handle_t_(
ur_context_handle_t Context, ur_device_handle_t Device,
ze_command_list_handle_t CommandList,
ze_command_list_handle_t CommandListTranslated,
ze_command_list_handle_t CommandListResetEvents,
ze_command_list_handle_t CopyCommandList,
ZeStruct<ze_command_list_desc_t> ZeDesc,
Expand Down Expand Up @@ -55,6 +56,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
ur_device_handle_t Device;
// Level Zero command list handle
ze_command_list_handle_t ZeComputeCommandList;
// Given a multi driver scenario, the command list handle must be translated to
// the internal driver handle to allow calls to driver experimental apis.
ze_command_list_handle_t ZeComputeCommandListTranslated;
// Level Zero command list handle
ze_command_list_handle_t ZeCommandListResetEvents;
// Level Zero command list descriptor
Expand Down
3 changes: 0 additions & 3 deletions source/adapters/level_zero/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,9 +66,6 @@ ur_result_t ze2urResult(ze_result_t ZeResult) {
}
}

usm::DisjointPoolAllConfigs DisjointPoolConfigInstance =
InitializeDisjointPoolConfig();

// This function will ensure compatibility with both Linux and Windows for
// setting environment variables.
bool setEnvVar(const char *name, const char *value) {
Expand Down
8 changes: 8 additions & 0 deletions source/adapters/level_zero/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,14 @@ ur_result_t ze2urResult(ze_result_t ZeResult);
return ze2urResult(Result); \
}

// Trace a call to Level-Zero RT, throw on error
//
// Expands to a braced block that:
//   1. invokes ZeName with the parenthesized argument list ZeArgs and stores
//      the ze_result_t it yields;
//   2. passes that result to ZeCall().doCall together with the stringified
//      function name and argument list;
//   3. if doCall yields a result that converts to true (i.e. non-zero),
//      throws the value produced by ze2urResult for that result.
// Unlike the macro defined immediately above, which `return`s the converted
// result, this one `throw`s it, so callers must be prepared to catch it.
// NOTE(review): the trailing `true` passed to doCall presumably enables error
// tracing (ZE_CALL_NOCHECK passes `false`) -- confirm against ZeCall::doCall.
#define ZE2UR_CALL_THROWS(ZeName, ZeArgs) \
{ \
ze_result_t ZeResult = ZeName ZeArgs; \
if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \
throw ze2urResult(Result); \
}

// Perform traced call to L0 without checking for errors
#define ZE_CALL_NOCHECK(ZeName, ZeArgs) \
ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false)
Expand Down
11 changes: 9 additions & 2 deletions source/adapters/level_zero/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,11 @@ ur_result_t bindlessImagesCreateImpl(ur_context_handle_t hContext,
return UR_RESULT_ERROR_INVALID_OPERATION;

uint64_t DeviceOffset{};
ZE2UR_CALL(zeImageGetDeviceOffsetExpFunctionPtr, (ZeImage, &DeviceOffset));
ze_image_handle_t ZeImageTranslated;
ZE2UR_CALL(zelLoaderTranslateHandle,
(ZEL_HANDLE_IMAGE, ZeImage, (void **)&ZeImageTranslated));
ZE2UR_CALL(zeImageGetDeviceOffsetExpFunctionPtr,
(ZeImageTranslated, &DeviceOffset));
*phImage = reinterpret_cast<ur_exp_image_handle_t>(DeviceOffset);

return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -652,8 +656,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(

size_t Width = widthInBytes / elementSizeBytes;
size_t RowPitch;
ze_device_handle_t ZeDeviceTranslated;
ZE2UR_CALL(zelLoaderTranslateHandle, (ZEL_HANDLE_DEVICE, hDevice->ZeDevice,
(void **)&ZeDeviceTranslated));
ZE2UR_CALL(zeMemGetPitchFor2dImageFunctionPtr,
(hContext->ZeContext, hDevice->ZeDevice, Width, height,
(hContext->ZeContext, ZeDeviceTranslated, Width, height,
elementSizeBytes, &RowPitch));
*pResultPitch = RowPitch;

Expand Down
6 changes: 4 additions & 2 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1668,7 +1668,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
// If not shared of any type, we can import the ptr
if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
// Promote the host ptr to USM host memory
ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver;
ze_driver_handle_t driverHandle =
Context->getPlatform()->ZeDriverHandleExpTranslated;
ZeUSMImport.doZeUSMImport(driverHandle, Host, Size);
HostPtrImported = true;
}
Expand Down Expand Up @@ -2252,7 +2253,8 @@ ur_result_t _ur_buffer::free() {
UR_CALL(ZeMemFreeHelper(UrContext, ZeHandle));
break;
case allocation_t::unimport:
ZeUSMImport.doZeUSMRelease(UrContext->getPlatform()->ZeDriver, ZeHandle);
ZeUSMImport.doZeUSMRelease(
UrContext->getPlatform()->ZeDriverHandleExpTranslated, ZeHandle);
break;
default:
die("_ur_buffer::free(): Unhandled release action");
Expand Down
4 changes: 4 additions & 0 deletions source/adapters/level_zero/platform.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ struct ur_platform_handle_t_ : public _ur_platform {
// a pretty good fit to keep here.
ze_driver_handle_t ZeDriver;

// Given a multi driver scenario, the driver handle must be translated to the
// internal driver handle to allow calls to driver experimental apis.
ze_driver_handle_t ZeDriverHandleExpTranslated;

// Cache versions info from zeDriverGetProperties.
std::string ZeDriverVersion;
std::string ZeDriverApiVersion;
Expand Down
1 change: 1 addition & 0 deletions source/adapters/level_zero/ur_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <unordered_map>
#include <vector>

#include <loader/ze_loader.h>
#include <ur/ur.hpp>
#include <ur_api.h>
#include <ze_api.h>
Expand Down
9 changes: 7 additions & 2 deletions source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@

#include <umf_helpers.hpp>

usm::DisjointPoolAllConfigs DisjointPoolConfigInstance =
InitializeDisjointPoolConfig();

ur_result_t umf2urResult(umf_result_t umfResult) {
if (umfResult == UMF_RESULT_SUCCESS)
return UR_RESULT_SUCCESS;
Expand Down Expand Up @@ -1026,7 +1029,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context,
// If not shared of any type, we can import the ptr
if (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_UNKNOWN) {
// Promote the host ptr to USM host memory
ze_driver_handle_t driverHandle = Context->getPlatform()->ZeDriver;
ze_driver_handle_t driverHandle =
Context->getPlatform()->ZeDriverHandleExpTranslated;
ZeUSMImport.doZeUSMImport(driverHandle, HostPtr, Size);
}
}
Expand All @@ -1039,6 +1043,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMReleaseExp(ur_context_handle_t Context,

// Release the imported memory.
if (ZeUSMImport.Supported && HostPtr != nullptr)
ZeUSMImport.doZeUSMRelease(Context->getPlatform()->ZeDriver, HostPtr);
ZeUSMImport.doZeUSMRelease(
Context->getPlatform()->ZeDriverHandleExpTranslated, HostPtr);
return UR_RESULT_SUCCESS;
}
Loading

0 comments on commit 4ee46be

Please sign in to comment.