Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Candidate for the v0.10.9 release tag #2157

Merged
merged 4 commits into from
Oct 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
project(unified-runtime VERSION 0.10.8)
project(unified-runtime VERSION 0.10.9)

# Check if unified runtime is built as a standalone project.
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR UR_STANDALONE_BUILD)
Expand Down
38 changes: 17 additions & 21 deletions source/adapters/cuda/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,14 +354,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);

CUgraphNode GraphNode;
try {
CUgraphNode GraphNode;

std::vector<CUgraphNode> DepsList;
UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList));
std::vector<CUgraphNode> DepsList;
UR_CHECK_ERROR(getNodesFromSyncPoints(
hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));

if (*pGlobalWorkSize == 0) {
try {
if (*pGlobalWorkSize == 0) {
// Create an empty node if the kernel workload size is zero
UR_CHECK_ERROR(cuGraphAddEmptyNode(&GraphNode, hCommandBuffer->CudaGraph,
DepsList.data(), DepsList.size()));
Expand All @@ -371,25 +371,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
if (pSyncPoint) {
*pSyncPoint = SyncPoint;
}
} catch (ur_result_t Err) {
return Err;
return UR_RESULT_SUCCESS;
}
return UR_RESULT_SUCCESS;
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();
UR_CHECK_ERROR(
setKernelParams(hCommandBuffer->Context, hCommandBuffer->Device, workDim,
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid));
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Context, hCommandBuffer->Device, workDim,
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, hKernel, CuFunc,
ThreadsPerBlock, BlocksPerGrid));

try {
// Set node param structure with the kernel related data
auto &ArgIndices = hKernel->getArgIndices();
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
Expand Down
36 changes: 16 additions & 20 deletions source/adapters/hip/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_ASSERT(!(pSyncPointWaitList == NULL && numSyncPointsInWaitList > 0),
UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);

hipGraphNode_t GraphNode;
std::vector<hipGraphNode_t> DepsList;
try {
hipGraphNode_t GraphNode;
std::vector<hipGraphNode_t> DepsList;

UR_CHECK_ERROR(getNodesFromSyncPoints(hCommandBuffer, numSyncPointsInWaitList,
pSyncPointWaitList, DepsList));
UR_CHECK_ERROR(getNodesFromSyncPoints(
hCommandBuffer, numSyncPointsInWaitList, pSyncPointWaitList, DepsList));

if (*pGlobalWorkSize == 0) {
try {
if (*pGlobalWorkSize == 0) {
// Create an empty node if the kernel workload size is zero
UR_CHECK_ERROR(hipGraphAddEmptyNode(&GraphNode, hCommandBuffer->HIPGraph,
DepsList.data(), DepsList.size()));
Expand All @@ -341,24 +341,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
if (pSyncPoint) {
*pSyncPoint = SyncPoint;
}
} catch (ur_result_t Err) {
return Err;
return UR_RESULT_SUCCESS;
}
return UR_RESULT_SUCCESS;
}

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {64u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

uint32_t LocalSize = hKernel->getLocalSize();
hipFunction_t HIPFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));
uint32_t LocalSize = hKernel->getLocalSize();
hipFunction_t HIPFunc = hKernel->get();
UR_CHECK_ERROR(setKernelParams(
hCommandBuffer->Device, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, hKernel, HIPFunc, ThreadsPerBlock, BlocksPerGrid));

try {
// Set node param structure with the kernel related data
auto &ArgIndices = hKernel->getArgIndices();
hipKernelNodeParams NodeParams;
Expand Down
92 changes: 82 additions & 10 deletions source/adapters/level_zero/adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,32 @@ class ur_legacy_sink : public logger::Sink {
~ur_legacy_sink() = default;
};

ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
// Find the corresponding ZesDevice Handle for a given ZeDevice
//
// Enumerates the Sysman drivers through
// GlobalAdapter->getSysManDriversFunctionPtr and asks each one in turn, via
// GlobalAdapter->getDeviceByUUIdFunctionPtr, to resolve coreDeviceUuid. The
// search stops at the first driver that reports ZE_RESULT_SUCCESS.
//
// Both function pointers are dereferenced without a null check, so the caller
// must only invoke this after they have been resolved.
//
// Parameters:
//   coreDeviceUuid - UUID handed to every per-driver lookup.
//   ZesDevice      - [out] forwarded to the lookup as its device-handle output.
//   SubDeviceId    - [out] forwarded to the lookup as its sub-device id output.
//   SubDevice      - [out] forwarded to the lookup as its sub-device flag
//                    output.
//   NOTE(review): the exact meaning of the three outputs is defined by
//   zesDriverGetDeviceByUuidExp, not by this file - confirm against the Level
//   Zero Sysman specification.
//
// Returns:
//   UR_RESULT_SUCCESS as soon as one driver resolves the UUID;
//   UR_RESULT_ERROR_INVALID_ARGUMENT when no driver does (including when there
//   are no Sysman drivers at all). A failing driver enumeration is handled by
//   ZE2UR_CALL, which is defined elsewhere - presumably it returns the
//   translated error to the caller.
ur_result_t getZesDeviceHandle(zes_uuid_t coreDeviceUuid,
                               zes_device_handle_t *ZesDevice,
                               uint32_t *SubDeviceId, ze_bool_t *SubDevice) {
  // Two-step enumeration: first ask for the driver count, then fetch the
  // handles into a buffer of that size.
  uint32_t ZesDriverCount = 0;
  ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr,
             (&ZesDriverCount, nullptr));
  std::vector<zes_driver_handle_t> ZesDrivers(ZesDriverCount);
  ZE2UR_CALL(GlobalAdapter->getSysManDriversFunctionPtr,
             (&ZesDriverCount, ZesDrivers.data()));

  for (uint32_t I = 0; I < ZesDriverCount; ++I) {
    // A failed lookup is not an error here: it only means this driver does not
    // own the device, so the result is deliberately not checked by the macro
    // and the next driver is tried.
    const ze_result_t LookupResult = ZE_CALL_NOCHECK(
        GlobalAdapter->getDeviceByUUIdFunctionPtr,
        (ZesDrivers[I], coreDeviceUuid, ZesDevice, SubDevice, SubDeviceId));
    if (LookupResult == ZE_RESULT_SUCCESS) {
      return UR_RESULT_SUCCESS;
    }
  }

  // No Sysman driver recognised the UUID.
  return UR_RESULT_ERROR_INVALID_ARGUMENT;
}

ur_result_t initPlatforms(PlatformVec &platforms,
ze_result_t ZesResult) noexcept try {
uint32_t ZeDriverCount = 0;
ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
if (ZeDriverCount == 0) {
Expand All @@ -48,24 +73,43 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {

ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
for (uint32_t I = 0; I < ZeDriverCount; ++I) {
// Keep track of the first platform init for this Driver
bool DriverPlatformInit = false;
ze_device_properties_t device_properties{};
device_properties.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
uint32_t ZeDeviceCount = 0;
ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, nullptr));
ZeDevices.resize(ZeDeviceCount);
ZE2UR_CALL(zeDeviceGet, (ZeDrivers[I], &ZeDeviceCount, ZeDevices.data()));
auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
// Check if this driver has GPU Devices
for (uint32_t D = 0; D < ZeDeviceCount; ++D) {
ZE2UR_CALL(zeDeviceGetProperties, (ZeDevices[D], &device_properties));

if (ZE_DEVICE_TYPE_GPU == device_properties.type) {
// If this Driver is a GPU, save it as a usable platform.
auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
UR_CALL(platform->initialize());

// Save a copy in the cache for future uses.
platforms.push_back(std::move(platform));
break;
// Check if this driver's platform has already been init.
if (!DriverPlatformInit) {
// If this Driver is a GPU, save it as a usable platform.
UR_CALL(platform->initialize());

// Save a copy in the cache for future uses.
platforms.push_back(std::move(platform));
// Mark this driver's platform as init to prevent additional platforms
// from being created per driver.
DriverPlatformInit = true;
}
if (ZesResult == ZE_RESULT_SUCCESS) {
// Populate the Zes/Ze device mapping for this Ze Device into the last
// added platform which represents the current driver being queried.
ur_zes_device_handle_data_t ZesDeviceData;
zes_uuid_t ZesUUID;
std::memcpy(&ZesUUID, &device_properties.uuid, sizeof(zes_uuid_t));
if (getZesDeviceHandle(
ZesUUID, &ZesDeviceData.ZesDevice, &ZesDeviceData.SubDeviceId,
&ZesDeviceData.SubDevice) == UR_RESULT_SUCCESS) {
platforms.back()->ZedeviceToZesDeviceMap.insert(
std::make_pair(ZeDevices[D], std::move(ZesDeviceData)));
}
}
}
}
}
Expand Down Expand Up @@ -147,8 +191,36 @@ ur_adapter_handle_t_::ur_adapter_handle_t_()

return;
}
// Dynamically load the new L0 SysMan separate init and new EXP apis
// separately. This must be done to avoid attempting to use symbols that do
// not exist in older loader runtimes.
#ifdef _WIN32
HMODULE processHandle = GetModuleHandle(NULL);
#else
HMODULE processHandle = nullptr;
#endif
GlobalAdapter->getDeviceByUUIdFunctionPtr =
(zes_pfnDriverGetDeviceByUuidExp_t)ur_loader::LibLoader::getFunctionPtr(
processHandle, "zesDriverGetDeviceByUuidExp");
GlobalAdapter->getSysManDriversFunctionPtr =
(zes_pfnDriverGet_t)ur_loader::LibLoader::getFunctionPtr(
processHandle, "zesDriverGet");
GlobalAdapter->sysManInitFunctionPtr =
(zes_pfnInit_t)ur_loader::LibLoader::getFunctionPtr(processHandle,
"zesInit");
if (GlobalAdapter->getDeviceByUUIdFunctionPtr &&
GlobalAdapter->getSysManDriversFunctionPtr &&
GlobalAdapter->sysManInitFunctionPtr) {
ze_init_flags_t L0ZesInitFlags = 0;
logger::debug("\nzesInit with flags value of {}\n",
static_cast<int>(L0ZesInitFlags));
GlobalAdapter->ZesResult = ZE_CALL_NOCHECK(
GlobalAdapter->sysManInitFunctionPtr, (L0ZesInitFlags));
} else {
GlobalAdapter->ZesResult = ZE_RESULT_ERROR_UNINITIALIZED;
}

ur_result_t err = initPlatforms(platforms);
ur_result_t err = initPlatforms(platforms, *GlobalAdapter->ZesResult);
if (err == UR_RESULT_SUCCESS) {
result = std::move(platforms);
} else {
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/level_zero/adapter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,13 @@

#include "logger/ur_logger.hpp"
#include <atomic>
#include <loader/ur_loader.hpp>
#include <loader/ze_loader.h>
#include <mutex>
#include <optional>
#include <ur/ur.hpp>
#include <ze_api.h>
#include <zes_ddi.h>

using PlatformVec = std::vector<std::unique_ptr<ur_platform_handle_t_>>;

Expand All @@ -26,7 +28,12 @@ struct ur_adapter_handle_t_ {
std::atomic<uint32_t> RefCount = 0;
std::mutex Mutex;

zes_pfnDriverGetDeviceByUuidExp_t getDeviceByUUIdFunctionPtr = nullptr;
zes_pfnDriverGet_t getSysManDriversFunctionPtr = nullptr;
zes_pfnInit_t sysManInitFunctionPtr = nullptr;

std::optional<ze_result_t> ZeResult;
std::optional<ze_result_t> ZesResult;
ZeCache<Result<PlatformVec>> PlatformCache;
logger::Logger &logger;
};
Expand Down
22 changes: 11 additions & 11 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,7 @@ setKernelPendingArguments(ur_exp_command_buffer_handle_t CommandBuffer,
char **ZeHandlePtr = nullptr;
if (Arg.Value) {
UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));
}
ZE2UR_CALL(zeKernelSetArgumentValue,
(Kernel->ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
Expand Down Expand Up @@ -950,10 +950,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(

char *ZeHandleSrc;
UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));
char *ZeHandleDst;
UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);

Expand Down Expand Up @@ -982,10 +982,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(

char *ZeHandleSrc;
UR_CALL(SrcBuffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));
char *ZeHandleDst;
UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

bool PreferCopyEngine = (SrcBuffer->OnHost || DstBuffer->OnHost);

Expand All @@ -1008,7 +1008,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(

char *ZeHandleDst = nullptr;
UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));
// Always prefer copy engine for writes
bool PreferCopyEngine = true;

Expand All @@ -1032,7 +1032,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(

char *ZeHandleDst = nullptr;
UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

// Always prefer copy engine for writes
bool PreferCopyEngine = true;
Expand All @@ -1054,7 +1054,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(

char *ZeHandleSrc = nullptr;
UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

// Always prefer copy engine for reads
bool PreferCopyEngine = true;
Expand All @@ -1077,7 +1077,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(

char *ZeHandleSrc;
UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

// Always prefer copy engine for reads
bool PreferCopyEngine = true;
Expand Down Expand Up @@ -1202,7 +1202,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
char *ZeHandleDst = nullptr;
_ur_buffer *UrBuffer = reinterpret_cast<_ur_buffer *>(Buffer);
UR_CALL(UrBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));

return enqueueCommandBufferFillHelper(
UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset,
Expand Down Expand Up @@ -1654,7 +1654,7 @@ ur_result_t updateKernelCommand(
char **ZeHandlePtr = nullptr;
if (NewMemObjArg) {
UR_CALL(NewMemObjArg->getZeHandlePtr(ZeHandlePtr, UrAccessMode,
CommandBuffer->Device));
CommandBuffer->Device, nullptr, 0u));
}

auto ZeMutableArgDesc =
Expand Down
Loading
Loading