Skip to content

Commit

Permalink
[L0] Only Override max allocation limits given env
Browse files Browse the repository at this point in the history
- Change the default from always allowing > 4GB allocations to requiring the user to explicitly request > 4GB
  allocation support when the max allocation allowed on that system is less than 4GB.
- This ensures performance is maintained on systems that don't handle > 4GB allocations natively and
  avoids breaking Ahead of Time (AOT) binaries that were built without > 4GB resource support.
- By setting UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1, the L0 Adapter
  will force the modules to be built with stateless or > 4GB support and
  will allow for the allocations to exceed the max single allocation size limit
  for that device.

Signed-off-by: Spruit, Neil R <neil.r.spruit@intel.com>
  • Loading branch information
nrspruit committed Jan 12, 2024
1 parent d06ba9d commit aaf71cd
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 20 deletions.
26 changes: 11 additions & 15 deletions source/adapters/level_zero/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <algorithm>
#include <climits>
#include <optional>
#include "ur_util.hpp"

UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(
ur_platform_handle_t Platform, ///< [in] handle of the platform instance
Expand Down Expand Up @@ -268,9 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(uint32_t{64});
}
case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
// if not optimized for 32-bit access, return total memory size.
// otherwise, return only maximum allocatable size.
if (Device->useOptimized32bitAccess() == 0) {
// if the user wishes to allocate large allocations on a system that usually
// does not allow that allocation size, then we return the max global mem
// size as the limit.
if (Device->useRelaxedAllocationLimits()) {
return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
} else {
return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
Expand Down Expand Up @@ -923,20 +925,14 @@ ur_device_handle_t_::useImmediateCommandLists() {
}
}

int32_t ur_device_handle_t_::useOptimized32bitAccess() {
static const int32_t Optimize32bitAccessMode = [this] {
// If device is Intel(R) Data Center GPU Max,
// use default provided by L0 driver.
// TODO: Use IP versioning to select based on range of devices
if (this->isPVC())
return -1;
const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
if (!UrRet)
return 0;
return std::atoi(UrRet);
bool ur_device_handle_t_::useRelaxedAllocationLimits() {
static const bool EnableRelaxedAllocationLimits = [] {
auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS");
const bool RetVal = UrRet ? std::stoi(*UrRet) : 0;
return RetVal;
}();

return Optimize32bitAccessMode;
return EnableRelaxedAllocationLimits;
}

ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
Expand Down
2 changes: 1 addition & 1 deletion source/adapters/level_zero/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ struct ur_device_handle_t_ : _ur_object {
// provide support for only one, like for Intel(R)
// Data Center GPU Max, for which L0 driver only
// supports stateless.
int32_t useOptimized32bitAccess();
bool useRelaxedAllocationLimits();

bool isSubDevice() { return RootDevice != nullptr; }

Expand Down
4 changes: 2 additions & 2 deletions source/adapters/level_zero/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
ZeBuildOptions += pOptions;
}

if (phDevices[0]->useOptimized32bitAccess() == 0) {
if (phDevices[0]->useRelaxedAllocationLimits()) {
ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
}

Expand Down Expand Up @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
// ze-opt-greater-than-4GB-buffer-required to disable
// stateful optimizations and be able to use larger than
// 4GB allocations on these kernels.
if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
if (Context->Devices[0]->useRelaxedAllocationLimits()) {
Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
}
}
Expand Down
4 changes: 2 additions & 2 deletions source/adapters/level_zero/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,11 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
ZeDesc.flags = 0;
ZeDesc.ordinal = 0;

if (Device->useOptimized32bitAccess() == 0 &&
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
if (Device->useRelaxedAllocationLimits() &&
(Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
// Tell Level-Zero to accept Size > maxMemAllocSize if
// large allocations are used.
ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
ZeDesc.pNext = &RelaxedDesc;
}
Expand Down

0 comments on commit aaf71cd

Please sign in to comment.