From 3cfe9b76843d7980946d6af4e1e421ccf7d6ff50 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Thu, 19 Sep 2024 16:23:12 -0700 Subject: [PATCH] [SYCL] Support sycl::kernel_bundle for multi-device scenario This PR includes: * Changes in the program manager methods to be able to properly create/build UR program for multiple devices. So far, we were mostly using only the first device in the vector to create/build UR program which made UR program unusable on other devices. * For SPIRV case we already have all necessary UR fuctions for multi-device case: urProgramBuildExp, urProgramLinkExp. For AOT case we need to add new function urProgramCreateWithBinaryExp which allows to create UR program from multiple device binaries. Hence the UR tag update. * Our program cache key allowed only a single device. I have changed it to contain a set of devices. If UR program is created and built for a set of devices then the same UR program is usable whenver we have any subset of this set. That's why if we have a program built for a set of devices then add all subsets to the cache. Before we were adding a record to the cache for each device from the set which is incorrect. For example, if someone requests a UR program for {dev2, dev3} from the cache then it is expected that this UR progam must be usable to submit a kernel to dev3. But we could get a program for {dev1, dev2} from the cache which is unusable on dev3. --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 10 +- sycl/source/detail/context_impl.cpp | 3 +- sycl/source/detail/helpers.cpp | 6 +- sycl/source/detail/kernel_program_cache.hpp | 8 +- .../program_manager/program_manager.cpp | 378 +++++++++++------- .../program_manager/program_manager.hpp | 7 +- .../KernelAndProgram/cache_env_vars.cpp | 4 +- .../KernelAndProgram/cache_env_vars_lin.cpp | 4 +- .../multi_device_bundle/build_twice.cpp | 52 +++ .../multi_device_bundle/compile_link.cpp | 34 ++ .../device_libs_and_caching.cpp | 159 ++++++++ .../SYCL2020/KernelBundleStateFiltering.cpp | 9 +- .../helpers/RuntimeLinkingCommon.hpp | 7 +- .../kernel-and-program/MultipleDevsCache.cpp | 56 ++- .../program_manager/CompileTarget.cpp | 10 +- 15 files changed, 546 insertions(+), 201 deletions(-) create mode 100644 sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp create mode 100644 sycl/test-e2e/ProgramManager/multi_device_bundle/compile_link.cpp create mode 100644 sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 8ff2f7bce49d2..f5c62ec0a31a1 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,14 +116,8 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit ce0325da13048af40acd61bd43ef88bafd84c6b3 - # Merge: 2ad32681 668c5e5d - # Author: Piotr Balcer - # Date: Thu Sep 26 10:51:57 2024 +0200 - # Merge pull request #2134 from nrspruit/fix_multi_device_event_driver_in_order_syclos - # [L0] Fix Multi Device Event handling and remove unhandled events from in order wait list - set(UNIFIED_RUNTIME_TAG ce0325da13048af40acd61bd43ef88bafd84c6b3) + set(UNIFIED_RUNTIME_REPO "https://github.com/againull/unified-runtime") + set(UNIFIED_RUNTIME_TAG 2b1cf6ee0bb43e0a94359eda5e163445d691b42a) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") # Due to the use of dependentloadflag and no installer for UMF and hwloc we need diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 698d0680728b1..88635248b12c1 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -21,6 +21,7 @@ #include #include +#include namespace sycl { inline namespace _V1 { @@ -490,7 +491,7 @@ std::optional context_impl::getProgramForDevImgs( auto &Cache = LockedCache.get().Cache; ur_device_handle_t &DevHandle = getSyclObjImpl(Device)->getHandleRef(); for (std::uintptr_t ImageIDs : ImgIdentifiers) { - auto OuterKey = std::make_pair(ImageIDs, DevHandle); + auto OuterKey = std::make_pair(ImageIDs, std::set{DevHandle}); size_t NProgs = KeyMap.count(OuterKey); if (NProgs == 0) continue; diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 8a6583d6dd741..5d5afbed51fd3 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -65,8 +65,8 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName, auto DeviceImpl = Queue->getDeviceImplPtr(); auto Device = detail::createSyclObjFromImpl(DeviceImpl); ur_program_handle_t Program = - detail::ProgramManager::getInstance().createURProgram(**DeviceImage, - Context, Device); + detail::ProgramManager::getInstance().createURProgram( + **DeviceImage, Context, {Device}); return {*DeviceImage, Program}; } @@ -94,7 +94,7 @@ retrieveKernelBinary(const QueueImplPtr &Queue, const char *KernelName, DeviceImage = &detail::ProgramManager::getInstance().getDeviceImage( KernelName, Context, Device); Program = detail::ProgramManager::getInstance().createURProgram( - *DeviceImage, Context, Device); + *DeviceImage, Context, {Device}); } return {DeviceImage, Program}; } diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 0e6323b74e3ef..9630143c9ee51 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -118,9 +119,10 @@ class KernelProgramCache { * when debugging environment variables are set and we can just ignore them * since all kernels will have their build options overridden with the same * string*/ - using ProgramCacheKeyT = - std::pair, ur_device_handle_t>; - using CommonProgramKeyT = std::pair; + using ProgramCacheKeyT = std::pair, + std::set>; + using CommonProgramKeyT = + std::pair>; struct ProgramCache { ::boost::unordered_map Cache; diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 2fa9e75be6f0c..bb7ca8cc37a53 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -73,30 +73,33 @@ ProgramManager &ProgramManager::getInstance() { } static ur_program_handle_t -createBinaryProgram(const ContextImplPtr Context, const device &Device, - const unsigned char *Data, size_t DataLen, +createBinaryProgram(const ContextImplPtr Context, + const std::vector &Devices, + const uint8_t **Binaries, size_t *Lengths, const std::vector Metadata) { const AdapterPtr &Adapter = Context->getAdapter(); -#ifndef _NDEBUG - uint32_t NumDevices = 0; - Adapter->call(Context->getHandleRef(), - UR_CONTEXT_INFO_NUM_DEVICES, - sizeof(NumDevices), &NumDevices, - /*param_value_size_ret=*/nullptr); - assert(NumDevices > 0 && - "Only a single device is supported for AOT compilation"); -#endif - ur_program_handle_t Program; - ur_device_handle_t UrDevice = getSyclObjImpl(Device)->getHandleRef(); + std::vector DeviceHandles; + std::transform( + Devices.begin(), Devices.end(), std::back_inserter(DeviceHandles), + [](const device &Dev) { return getSyclObjImpl(Dev)->getHandleRef(); }); ur_result_t BinaryStatus = UR_RESULT_SUCCESS; ur_program_properties_t Properties = {}; Properties.stype = UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES; Properties.pNext = nullptr; Properties.count = Metadata.size(); Properties.pMetadatas = Metadata.data(); - Adapter->call( - Context->getHandleRef(), UrDevice, DataLen, Data, &Properties, &Program); + + assert(Devices.size() > 0 && "No devices provided for program creation"); + ur_result_t Error = + Adapter->call_nocheck( + Context->getHandleRef(), DeviceHandles.size(), DeviceHandles.data(), + Lengths, Binaries, &Properties, &Program); + if (Error == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + Adapter->call( + Context->getHandleRef(), DeviceHandles[0], Lengths[0], Binaries[0], + &Properties, &Program); + } if (BinaryStatus != UR_RESULT_SUCCESS) { throw detail::set_ur_error( @@ -178,13 +181,28 @@ static bool isDeviceBinaryTypeSupported(const context &C, return "unknown"; } +[[maybe_unused]] auto VecToString = [](auto &Vec) -> std::string { + std::ostringstream Out; + Out << "{"; + for (auto Elem : Vec) + Out << Elem << " "; + Out << "}"; + return Out.str(); +}; + ur_program_handle_t ProgramManager::createURProgram(const RTDeviceBinaryImage &Img, - const context &Context, const device &Device) { - if constexpr (DbgProgMgr > 0) + const context &Context, + const std::vector &Devices) { + if constexpr (DbgProgMgr > 0) { + std::vector URDevices; + std::transform( + Devices.begin(), Devices.end(), std::back_inserter(URDevices), + [](const device &Dev) { return getSyclObjImpl(Dev)->getHandleRef(); }); std::cerr << ">>> ProgramManager::createPIProgram(" << &Img << ", " - << getSyclObjImpl(Context).get() << ", " - << getSyclObjImpl(Device).get() << ")\n"; + << getSyclObjImpl(Context).get() << ", " << VecToString(URDevices) + << ")\n"; + } const sycl_device_binary_struct &RawImg = Img.getRawData(); // perform minimal sanity checks on the device image and the descriptor @@ -221,10 +239,13 @@ ProgramManager::createURProgram(const RTDeviceBinaryImage &Img, // Load the image const ContextImplPtr Ctx = getSyclObjImpl(Context); + std::vector Binaries( + Devices.size(), const_cast(RawImg.BinaryStart)); + std::vector Lengths(Devices.size(), ImgSize); ur_program_handle_t Res = Format == SYCL_DEVICE_BINARY_TYPE_SPIRV ? createSpirvProgram(Ctx, RawImg.BinaryStart, ImgSize) - : createBinaryProgram(Ctx, Device, RawImg.BinaryStart, ImgSize, + : createBinaryProgram(Ctx, Devices, Binaries.data(), Lengths.data(), ProgMetadata); { @@ -233,7 +254,7 @@ ProgramManager::createURProgram(const RTDeviceBinaryImage &Img, NativePrograms.insert({Res, &Img}); } - Ctx->addDeviceGlobalInitializer(Res, {Device}, &Img); + Ctx->addDeviceGlobalInitializer(Res, Devices, &Img); if constexpr (DbgProgMgr > 1) std::cerr << "created program: " << Res @@ -491,12 +512,12 @@ static void applyOptionsFromEnvironment(std::string &CompileOpts, std::pair ProgramManager::getOrCreateURProgram( const RTDeviceBinaryImage &MainImg, const std::vector &AllImages, - const context &Context, const device &Device, + const context &Context, const std::vector &Devices, const std::string &CompileAndLinkOptions, SerializedObj SpecConsts) { ur_program_handle_t NativePrg; // TODO: Or native? auto BinProg = PersistentDeviceCodeCache::getItemFromDisc( - Device, AllImages, SpecConsts, CompileAndLinkOptions); + Devices[0], AllImages, SpecConsts, CompileAndLinkOptions); if (BinProg.size()) { // Get program metadata from properties std::vector ProgMetadataVector; @@ -505,12 +526,14 @@ std::pair ProgramManager::getOrCreateURProgram( ProgMetadataVector.insert(ProgMetadataVector.end(), ImgProgMetadata.begin(), ImgProgMetadata.end()); } - // TODO: Build for multiple devices once supported by program manager - NativePrg = createBinaryProgram(getSyclObjImpl(Context), Device, - (const unsigned char *)BinProg[0].data(), - BinProg[0].size(), ProgMetadataVector); + std::vector Binaries(Devices.size(), + (const uint8_t *)BinProg[0].data()); + std::vector Lengths(Devices.size(), BinProg[0].size()); + NativePrg = + createBinaryProgram(getSyclObjImpl(Context), Devices, Binaries.data(), + Lengths.data(), ProgMetadataVector); } else { - NativePrg = createURProgram(MainImg, Context, Device); + NativePrg = createURProgram(MainImg, Context, Devices); } return {NativePrg, BinProg.size()}; } @@ -797,7 +820,7 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( appendCompileEnvironmentVariablesThatAppend(CompileOpts); appendLinkEnvironmentVariablesThatAppend(LinkOpts); auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram( - Img, AllImages, Context, Device, CompileOpts + LinkOpts, SpecConsts); + Img, AllImages, Context, {Device}, CompileOpts + LinkOpts, SpecConsts); if (!DeviceCodeWasInCache) { if (Img.supportsSpecConstants()) @@ -835,7 +858,7 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( DeviceImageImpl->get_spec_const_blob_ref(); ur_program_handle_t NativePrg = - createURProgram(*BinImg, Context, Device); + createURProgram(*BinImg, Context, {Device}); if (BinImg->supportsSpecConstants()) setSpecializationConstants(DeviceImageImpl, NativePrg, Adapter); @@ -843,11 +866,13 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( ProgramsToLink.push_back(NativePrg); } } - ProgramPtr BuiltProgram = - build(std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, - getSyclObjImpl(Device).get()->getHandleRef(), DeviceLibReqMask, - ProgramsToLink, /*CreatedFromBinary*/ Img.getFormat() != - SYCL_DEVICE_BINARY_TYPE_SPIRV); + std::vector Devs = { + getSyclObjImpl(Device).get()->getHandleRef()}; + ; + ProgramPtr BuiltProgram = build( + std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, Devs, + DeviceLibReqMask, ProgramsToLink, + /*CreatedFromBinary*/ Img.getFormat() != SYCL_DEVICE_BINARY_TYPE_SPIRV); // Those extra programs won't be used anymore, just the final linked result for (ur_program_handle_t Prg : ProgramsToLink) Adapter->call(Prg); @@ -875,8 +900,8 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( uint32_t ImgId = Img.getImageID(); const ur_device_handle_t UrDevice = Dev->getHandleRef(); - auto CacheKey = - std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), UrDevice); + auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), + std::set{UrDevice}); auto GetCachedBuildF = [&Cache, &CacheKey]() { return Cache.getOrInsertProgram(CacheKey); @@ -1167,46 +1192,86 @@ static ur_result_t doCompile(const AdapterPtr &Adapter, return Result; } -static ur_program_handle_t loadDeviceLibFallback(const ContextImplPtr Context, - DeviceLibExt Extension, - ur_device_handle_t Device, - bool UseNativeLib) { +static ur_program_handle_t +loadDeviceLibFallback(const ContextImplPtr Context, DeviceLibExt Extension, + std::vector &Devices, + bool UseNativeLib) { auto LibFileName = getDeviceLibFilename(Extension, UseNativeLib); - auto LockedCache = Context->acquireCachedLibPrograms(); - auto CachedLibPrograms = LockedCache.get(); - auto CacheResult = CachedLibPrograms.emplace( - std::make_pair(std::make_pair(Extension, Device), nullptr)); - bool Cached = !CacheResult.second; - auto LibProgIt = CacheResult.first; - ur_program_handle_t &LibProg = LibProgIt->second; - - if (Cached) - return LibProg; - - if (!loadDeviceLib(Context, LibFileName, LibProg)) { - CachedLibPrograms.erase(LibProgIt); + auto &CachedLibPrograms = LockedCache.get(); + // Collect list of devices to compile the library for. Library was already + // compiled for a device if there is a corresponding record in the per-context + // cache. + std::vector DevicesToCompile; + ur_program_handle_t URProgram = nullptr; + assert(Devices.size() > 0 && + "At least one device is expected in the input vector"); + // Vector of devices that don't have the library cached. + for (auto Dev : Devices) { + auto CacheResult = CachedLibPrograms.emplace( + std::make_pair(std::make_pair(Extension, Dev), nullptr)); + auto Cached = !CacheResult.second; + if (!Cached) { + DevicesToCompile.push_back(Dev); + } else { + auto CachedURProgram = CacheResult.first->second; + assert(CachedURProgram && "If device lib UR program was cached then is " + "expected to be not a nullptr"); + assert(((URProgram && URProgram == CachedURProgram) || (!URProgram)) && + "All cached UR programs should be the same"); + if (!URProgram) + URProgram = CachedURProgram; + } + } + if (DevicesToCompile.empty()) + return URProgram; + + auto EraseProgramForDevices = [&]() { + for (auto Dev : DevicesToCompile) + CachedLibPrograms.erase(std::make_pair(Extension, Dev)); + }; + bool IsProgramCreated = !URProgram; + + // Create UR program for device lib if we don't have it yet. + if (!URProgram && !loadDeviceLib(Context, LibFileName, URProgram)) { + EraseProgramForDevices(); throw exception(make_error_code(errc::build), std::string("Failed to load ") + LibFileName); } + // Insert URProgram into the cache for all devices that we compiled it for. + // Retain UR program for each record in the cache. const AdapterPtr &Adapter = Context->getAdapter(); + + // UR program handle is stored in the cache for each device that we compiled + // it for. We have to retain UR program for each record in the cache. We need + // to take into account that UR program creation makes its reference count to + // be 1. + size_t RetainCount = + IsProgramCreated ? DevicesToCompile.size() - 1 : DevicesToCompile.size(); + for (size_t I = 0; I < RetainCount; ++I) + Adapter->call(URProgram); + + for (auto Dev : DevicesToCompile) + CachedLibPrograms[std::make_pair(Extension, Dev)] = URProgram; + // TODO no spec constants are used in the std libraries, support in the future // Do not use compile options for library programs: it is not clear if user // options (image options) are supposed to be applied to library program as // well, and what actually happens to a SPIR-V program if we apply them. ur_result_t Error = - doCompile(Adapter, LibProg, 1, &Device, Context->getHandleRef(), ""); + doCompile(Adapter, URProgram, DevicesToCompile.size(), + DevicesToCompile.data(), Context->getHandleRef(), ""); if (Error != UR_RESULT_SUCCESS) { - CachedLibPrograms.erase(LibProgIt); + EraseProgramForDevices(); throw detail::set_ur_error( exception(make_error_code(errc::build), - ProgramManager::getProgramBuildLog(LibProg, Context)), + ProgramManager::getProgramBuildLog(URProgram, Context)), Error); } - return LibProg; + return URProgram; } ProgramManager::ProgramManager() : m_AsanFoundInImage(false) { @@ -1459,7 +1524,7 @@ static bool isDeviceLibRequired(DeviceLibExt Ext, uint32_t DeviceLibReqMask) { static std::vector getDeviceLibPrograms(const ContextImplPtr Context, - const ur_device_handle_t &Device, + std::vector &Devices, uint32_t DeviceLibReqMask) { std::vector Programs; @@ -1478,50 +1543,65 @@ getDeviceLibPrograms(const ContextImplPtr Context, // Disable all devicelib extensions requiring fp64 support if at least // one underlying device doesn't support cl_khr_fp64. - std::string DevExtList = - Context->getPlatformImpl()->getDeviceImpl(Device)->get_device_info_string( - UrInfoCode::value); - const bool fp64Support = (DevExtList.npos != DevExtList.find("cl_khr_fp64")); + const bool fp64Support = std::all_of( + Devices.begin(), Devices.end(), [&Context](ur_device_handle_t Device) { + std::string DevExtList = + Context->getPlatformImpl() + ->getDeviceImpl(Device) + ->get_device_info_string( + UrInfoCode::value); + return (DevExtList.npos != DevExtList.find("cl_khr_fp64")); + }); - // Load a fallback library for an extension if the device does not + // Load a fallback library for an extension if the any device does not // support it. - for (auto &Pair : RequiredDeviceLibExt) { - DeviceLibExt Ext = Pair.first; - bool &FallbackIsLoaded = Pair.second; - - if (FallbackIsLoaded) { - continue; - } + for (auto Device : Devices) { + std::string DevExtList = + Context->getPlatformImpl() + ->getDeviceImpl(Device) + ->get_device_info_string( + UrInfoCode::value); + + for (auto &Pair : RequiredDeviceLibExt) { + DeviceLibExt Ext = Pair.first; + bool &FallbackIsLoaded = Pair.second; + + if (FallbackIsLoaded) { + continue; + } - if (!isDeviceLibRequired(Ext, DeviceLibReqMask)) { - continue; - } + if (!isDeviceLibRequired(Ext, DeviceLibReqMask)) { + continue; + } - if ((Ext == DeviceLibExt::cl_intel_devicelib_math_fp64 || - Ext == DeviceLibExt::cl_intel_devicelib_complex_fp64 || - Ext == DeviceLibExt::cl_intel_devicelib_imf_fp64) && - !fp64Support) { - continue; - } + // Skip loading the fallback library that requires fp64 support if any + // device in the list doesn't support fp64. + if ((Ext == DeviceLibExt::cl_intel_devicelib_math_fp64 || + Ext == DeviceLibExt::cl_intel_devicelib_complex_fp64 || + Ext == DeviceLibExt::cl_intel_devicelib_imf_fp64) && + !fp64Support) { + continue; + } - auto ExtName = getDeviceLibExtensionStr(Ext); + auto ExtName = getDeviceLibExtensionStr(Ext); - bool InhibitNativeImpl = false; - if (const char *Env = getenv("SYCL_DEVICELIB_INHIBIT_NATIVE")) { - InhibitNativeImpl = strstr(Env, ExtName) != nullptr; - } + bool InhibitNativeImpl = false; + if (const char *Env = getenv("SYCL_DEVICELIB_INHIBIT_NATIVE")) { + InhibitNativeImpl = strstr(Env, ExtName) != nullptr; + } - bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtName); - if (!DeviceSupports || InhibitNativeImpl) { - Programs.push_back( - loadDeviceLibFallback(Context, Ext, Device, /*UseNativeLib=*/false)); - FallbackIsLoaded = true; - } else { - // bfloat16 needs native library if device supports it - if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) { - Programs.push_back( - loadDeviceLibFallback(Context, Ext, Device, /*UseNativeLib=*/true)); + bool DeviceSupports = DevExtList.npos != DevExtList.find(ExtName); + if (!DeviceSupports || InhibitNativeImpl) { + Programs.push_back(loadDeviceLibFallback(Context, Ext, Devices, + /*UseNativeLib=*/false)); FallbackIsLoaded = true; + } else { + // bfloat16 needs native library if device supports it + if (Ext == DeviceLibExt::cl_intel_devicelib_bfloat16) { + Programs.push_back(loadDeviceLibFallback(Context, Ext, Devices, + /*UseNativeLib=*/true)); + FallbackIsLoaded = true; + } } } } @@ -1531,14 +1611,16 @@ getDeviceLibPrograms(const ContextImplPtr Context, ProgramManager::ProgramPtr ProgramManager::build( ProgramPtr Program, const ContextImplPtr Context, const std::string &CompileOptions, const std::string &LinkOptions, - ur_device_handle_t Device, uint32_t DeviceLibReqMask, + std::vector &Devices, uint32_t DeviceLibReqMask, const std::vector &ExtraProgramsToLink, bool CreatedFromBinary) { if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::build(" << Program.get() << ", " - << CompileOptions << ", " << LinkOptions << ", ... " << Device - << ")\n"; + << CompileOptions << ", " << LinkOptions << ", " + << VecToString(Devices) << ", " << std::hex << DeviceLibReqMask + << std::dec << ", " << VecToString(ExtraProgramsToLink) << ", " + << CreatedFromBinary << ")\n"; } bool LinkDeviceLibs = (DeviceLibReqMask != 0); @@ -1552,7 +1634,7 @@ ProgramManager::ProgramPtr ProgramManager::build( std::vector LinkPrograms; if (LinkDeviceLibs) { - LinkPrograms = getDeviceLibPrograms(Context, Device, DeviceLibReqMask); + LinkPrograms = getDeviceLibPrograms(Context, Devices, DeviceLibReqMask); } static const char *ForceLinkEnv = std::getenv("SYCL_FORCE_LINK"); @@ -1564,8 +1646,7 @@ ProgramManager::ProgramPtr ProgramManager::build( ? CompileOptions : (CompileOptions + " " + LinkOptions); ur_result_t Error = Adapter->call_nocheck( - Program.get(), - /*num devices =*/1, &Device, Options.c_str()); + Program.get(), Devices.size(), Devices.data(), Options.c_str()); if (Error == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { Error = Adapter->call_nocheck( Context->getHandleRef(), Program.get(), Options.c_str()); @@ -1582,7 +1663,7 @@ ProgramManager::ProgramPtr ProgramManager::build( // Include the main program and compile/link everything together if (!CreatedFromBinary) { - auto Res = doCompile(Adapter, Program.get(), /*num devices =*/1, &Device, + auto Res = doCompile(Adapter, Program.get(), Devices.size(), Devices.data(), Context->getHandleRef(), CompileOptions.c_str()); Adapter->checkUrResult(Res); } @@ -1590,7 +1671,7 @@ ProgramManager::ProgramPtr ProgramManager::build( for (ur_program_handle_t Prg : ExtraProgramsToLink) { if (!CreatedFromBinary) { - auto Res = doCompile(Adapter, Prg, /*num devices =*/1, &Device, + auto Res = doCompile(Adapter, Prg, Devices.size(), Devices.data(), Context->getHandleRef(), CompileOptions.c_str()); Adapter->checkUrResult(Res); } @@ -1600,9 +1681,9 @@ ProgramManager::ProgramPtr ProgramManager::build( ur_program_handle_t LinkedProg = nullptr; auto doLink = [&] { auto Res = Adapter->call_nocheck( - Context->getHandleRef(), - /*num devices =*/1, &Device, LinkPrograms.size(), LinkPrograms.data(), - LinkOptions.c_str(), &LinkedProg); + Context->getHandleRef(), Devices.size(), Devices.data(), + LinkPrograms.size(), LinkPrograms.data(), LinkOptions.c_str(), + &LinkedProg); if (Res == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { Res = Adapter->call_nocheck( Context->getHandleRef(), LinkPrograms.size(), LinkPrograms.data(), @@ -2342,21 +2423,10 @@ ProgramManager::compile(const device_image_plain &DeviceImage, const AdapterPtr &Adapter = getSyclObjImpl(InputImpl->get_context())->getAdapter(); - // TODO: Add support for creating non-SPIRV programs from multiple devices. - if (InputImpl->get_bin_image_ref()->getFormat() != - SYCL_DEVICE_BINARY_TYPE_SPIRV && - Devs.size() > 1) - // FIXME: It was probably intended to be thrown, but a unittest starts - // failing if we do so, investigate independently of switching to SYCL 2020 - // `exception`. - exception(make_error_code(errc::feature_not_supported), - "Creating a program from AOT binary for multiple device is " - "not supported"); - // Device is not used when creating program from SPIRV, so passing only one // device is OK. ur_program_handle_t Prog = createURProgram(*InputImpl->get_bin_image_ref(), - InputImpl->get_context(), Devs[0]); + InputImpl->get_context(), Devs); if (InputImpl->get_bin_image_ref()->supportsSpecConstants()) setSpecializationConstants(InputImpl, Prog, Adapter); @@ -2542,21 +2612,11 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, // Should always come last! appendCompileEnvironmentVariablesThatAppend(CompileOpts); appendLinkEnvironmentVariablesThatAppend(LinkOpts); - // TODO: Add support for creating non-SPIRV programs from multiple devices. - if (InputImpl->get_bin_image_ref()->getFormat() != - SYCL_DEVICE_BINARY_TYPE_SPIRV && - Devs.size() > 1) - // FIXME: It was probably intended to be thrown, but a unittest starts - // failing if we do so, investigate independently of switching to SYCL - // 2020 `exception`. - exception(make_error_code(errc::feature_not_supported), - "Creating a program from AOT binary for multiple device " - "is not supported"); // Device is not used when creating program from SPIRV, so passing only one // device is OK. auto [NativePrg, DeviceCodeWasInCache] = getOrCreateURProgram( - Img, {&Img}, Context, Devs[0], CompileOpts + LinkOpts, SpecConsts); + Img, {&Img}, Context, Devs, CompileOpts + LinkOpts, SpecConsts); if (!DeviceCodeWasInCache && InputImpl->get_bin_image_ref()->supportsSpecConstants()) @@ -2579,10 +2639,13 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, // TODO: Add support for dynamic linking with kernel bundles std::vector ExtraProgramsToLink; + std::vector URDevices; + for (auto Dev : Devs) { + URDevices.push_back(getSyclObjImpl(Dev).get()->getHandleRef()); + } ProgramPtr BuiltProgram = build(std::move(ProgramManaged), ContextImpl, CompileOpts, LinkOpts, - getSyclObjImpl(Devs[0]).get()->getHandleRef(), DeviceLibReqMask, - ExtraProgramsToLink); + URDevices, DeviceLibReqMask, ExtraProgramsToLink); emitBuiltProgramInfo(BuiltProgram.get(), ContextImpl); @@ -2614,9 +2677,14 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, } uint32_t ImgId = Img.getImageID(); - ur_device_handle_t UrDevice = getSyclObjImpl(Devs[0]).get()->getHandleRef(); - auto CacheKey = - std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), UrDevice); + std::set URDevicesSet; + std::transform(Devs.begin(), Devs.end(), + std::inserter(URDevicesSet, URDevicesSet.begin()), + [](const device &Dev) { + return getSyclObjImpl(Dev).get()->getHandleRef(); + }); + auto CacheKey = std::make_pair(std::make_pair(std::move(SpecConsts), ImgId), + URDevicesSet); // CacheKey is captured by reference so when we overwrite it later we can // reuse this function. @@ -2630,26 +2698,33 @@ device_image_plain ProgramManager::build(const device_image_plain &DeviceImage, ur_program_handle_t ResProgram = BuildResult->Val; - // Cache supports key with once device only, but here we have multiple - // devices a program is built for, so add the program to the cache for all - // other devices. + // Here we have multiple devices a program is built for, so add the program to + // the cache for all subsets of provided list of devices. const AdapterPtr &Adapter = ContextImpl->getAdapter(); - auto CacheOtherDevices = [ResProgram, &Adapter]() { + auto CacheSubsets = [ResProgram, &Adapter]() { Adapter->call(ResProgram); return ResProgram; }; - // The program for device "0" is already added to the cache during the first - // call to getOrBuild, so starting with "1" - for (size_t Idx = 1; Idx < Devs.size(); ++Idx) { - const ur_device_handle_t UrDeviceAdd = - getSyclObjImpl(Devs[Idx]).get()->getHandleRef(); - - // Change device in the cache key to reduce copying of spec const data. - CacheKey.second = UrDeviceAdd; - Cache.getOrBuild(GetCachedBuildF, CacheOtherDevices); - // getOrBuild is not supposed to return nullptr - assert(BuildResult != nullptr && "Invalid build result"); + if (URDevicesSet.size() > 1) { + // emplace all subsets of the current set of devices into the cache. + // Set of all devices is not included in the loop as it was already added + // into the cache. + for (int Mask = 1; Mask < (1 << URDevicesSet.size()) - 1; ++Mask) { + std::set Subset; + int Index = 0; + for (auto It = URDevicesSet.begin(); It != URDevicesSet.end(); + ++It, ++Index) { + if (Mask & (1 << Index)) { + Subset.insert(*It); + } + } + // Change device in the cache key to reduce copying of spec const data. + CacheKey.second = Subset; + Cache.getOrBuild(GetCachedBuildF, CacheSubsets); + // getOrBuild is not supposed to return nullptr + assert(BuildResult != nullptr && "Invalid build result"); + } } // devive_image_impl shares ownership of PIProgram with, at least, program @@ -2773,7 +2848,7 @@ ur_kernel_handle_t ProgramManager::getOrCreateMaterializedKernel( if constexpr (DbgProgMgr > 0) std::cerr << ">>> Adding the kernel to the cache.\n"; - auto Program = createURProgram(Img, Context, Device); + auto Program = createURProgram(Img, Context, {Device}); auto DeviceImpl = detail::getSyclObjImpl(Device); auto &Adapter = DeviceImpl->getAdapter(); UrFuncInfo programReleaseInfo; @@ -2786,9 +2861,10 @@ ur_kernel_handle_t ProgramManager::getOrCreateMaterializedKernel( applyOptionsFromEnvironment(CompileOpts, LinkOpts); // No linking of extra programs reqruired. std::vector ExtraProgramsToLink; + std::vector Devs = {DeviceImpl->getHandleRef()}; auto BuildProgram = build(std::move(ProgramManaged), detail::getSyclObjImpl(Context), - CompileOpts, LinkOpts, DeviceImpl->getHandleRef(), + CompileOpts, LinkOpts, Devs, /*For non SPIR-V devices DeviceLibReqdMask is always 0*/ 0, ExtraProgramsToLink); ur_kernel_handle_t UrKernel{nullptr}; diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index 56db9cf8d1fec..8ab12229fc6c5 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -104,7 +104,7 @@ class ProgramManager { ur_program_handle_t createURProgram(const RTDeviceBinaryImage &Img, const context &Context, - const device &Device); + const std::vector &Devices); /// Creates a UR program using either a cached device code binary if present /// in the persistent cache or from the supplied device image otherwise. /// \param Img The device image used to create the program. @@ -127,7 +127,7 @@ class ProgramManager { std::pair getOrCreateURProgram( const RTDeviceBinaryImage &Img, const std::vector &AllImages, - const context &Context, const device &Device, + const context &Context, const std::vector &Devices, const std::string &CompileAndLinkOptions, SerializedObj SpecConsts); /// Builds or retrieves from cache a program defining the kernel with given /// name. @@ -302,7 +302,8 @@ class ProgramManager { decltype(&::urProgramRelease)>; ProgramPtr build(ProgramPtr Program, const ContextImplPtr Context, const std::string &CompileOptions, - const std::string &LinkOptions, ur_device_handle_t Device, + const std::string &LinkOptions, + std::vector &Devices, uint32_t DeviceLibReqMask, const std::vector &ProgramsToLink, bool CreatedFromBinary = false); diff --git a/sycl/test-e2e/KernelAndProgram/cache_env_vars.cpp b/sycl/test-e2e/KernelAndProgram/cache_env_vars.cpp index 852ec3ed9df54..7c17fce6ab81c 100644 --- a/sycl/test-e2e/KernelAndProgram/cache_env_vars.cpp +++ b/sycl/test-e2e/KernelAndProgram/cache_env_vars.cpp @@ -25,12 +25,12 @@ // Some backends will call urProgramBuild and some will call // urProgramBuildExp depending on urProgramBuildExp support. -// CHECK-BUILD-NOT: urProgramCreateWithBinary( +// CHECK-BUILD-NOT: urProgramCreateWithBinary{{(Exp)?}}( // CHECK-BUILD: urProgramCreateWithIL( // CHECK-BUILD: urProgramBuild{{(Exp)?}}( // CHECK-CACHE-NOT: urProgramCreateWithIL( -// CHECK-CACHE: urProgramCreateWithBinary( +// CHECK-CACHE: urProgramCreateWithBinaryExp( // CHECK-CACHE: urProgramBuild{{(Exp)?}}( #include "cache_env_vars.hpp" diff --git a/sycl/test-e2e/KernelAndProgram/cache_env_vars_lin.cpp b/sycl/test-e2e/KernelAndProgram/cache_env_vars_lin.cpp index e43ed543ba5d2..a5f6660eca578 100644 --- a/sycl/test-e2e/KernelAndProgram/cache_env_vars_lin.cpp +++ b/sycl/test-e2e/KernelAndProgram/cache_env_vars_lin.cpp @@ -24,12 +24,12 @@ // Some backends will call urProgramBuild and some will call urProgramBuildExp depending on urProgramBuildExp support. -// CHECK-BUILD-NOT: urProgramCreateWithBinary( +// CHECK-BUILD-NOT: urProgramCreateWithBinary{{(Exp)?}}( // CHECK-BUILD: urProgramCreateWithIL( // CHECK-BUILD: urProgramBuild{{(Exp)?}}( // CHECK-CACHE-NOT: urProgramCreateWithIL( -// CHECK-CACHE: urProgramCreateWithBinary( +// CHECK-CACHE: urProgramCreateWithBinary{{(Exp)?}}( // CHECK-CACHE: urProgramBuild{{(Exp)?}}( #include "cache_env_vars.hpp" diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp new file mode 100644 index 0000000000000..46b6e169e40c6 --- /dev/null +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp @@ -0,0 +1,52 @@ +// REQUIRES: level_zero && gpu + +// Test to check that we can create input kernel bundle and call build twice for +// overlapping set of devices and execute the kernel on each device. + +// RUN: %{build} -o %t.out +// RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=3 SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s + +#include + +class Kernel; + +int main() { + sycl::platform platform; + auto devices = platform.get_devices(); + assert(devices.size() >= 3); + auto dev1 = devices[0], dev2 = devices[1], dev3 = devices[2]; + + auto ctx = sycl::context({dev1, dev2, dev3}); + sycl::queue queues[3] = {sycl::queue(ctx, dev1), sycl::queue(ctx, dev2), + sycl::queue(ctx, dev3)}; + sycl::kernel_id kid = sycl::get_kernel_id(); + sycl::kernel_bundle kernelBundleInput = + sycl::get_kernel_bundle(ctx, {kid}); + // CHECK: urProgramCreateWithIL + // CHECK: urProgramBuildExp + auto KernelBundleExe1 = build(kernelBundleInput, {dev1, dev2}); + // CHECK: urProgramCreateWithIL + // CHECK: urProgramBuildExp + auto KernelBundleExe2 = build(kernelBundleInput, {dev2, dev3}); + // No other program creation calls are expected. + // CHECK-NOT: urProgramCreateWithIL + auto KernelObj1 = KernelBundleExe1.get_kernel(kid); + auto KernelObj2 = KernelBundleExe2.get_kernel(kid); + queues[0].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(KernelBundleExe1); + cgh.single_task([=]() {}); + }); + queues[1].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(KernelBundleExe1); + cgh.single_task(KernelObj1); + }); + queues[1].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(KernelBundleExe2); + cgh.single_task(KernelObj2); + }); + queues[2].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(KernelBundleExe2); + cgh.single_task(KernelObj2); + }); + return 0; +} diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/compile_link.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/compile_link.cpp new file mode 100644 index 0000000000000..a7b576649ca11 --- /dev/null +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/compile_link.cpp @@ -0,0 +1,34 @@ +// REQUIRES: level_zero && gpu + +// RUN: %{build} -o %t.out +// RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=3 SYCL_UR_TRACE=2 %{run} %t.out + +// Test to check that we can compile and link a kernel bundle for multiple +// devices and run the kernel on each device. +#include + +class Kernel; + +int main() { + sycl::platform platform; + auto devices = platform.get_devices(); + assert(devices.size() >= 3); + auto dev1 = devices[0], dev2 = devices[1], dev3 = devices[2]; + + auto ctx = sycl::context({dev1, dev2, dev3}); + sycl::queue queues[3] = {sycl::queue(ctx, dev1), sycl::queue(ctx, dev2), + sycl::queue(ctx, dev3)}; + sycl::kernel_id kid = sycl::get_kernel_id(); + sycl::kernel_bundle kernelBundleInput = + sycl::get_kernel_bundle(ctx, {kid}); + auto KernelBundleCompiled = compile(kernelBundleInput, {dev1, dev2, dev3}); + auto KernelBundleLinked = link(KernelBundleCompiled, {dev1, dev2, dev3}); + for (int i = 0; i < 3; i++) { + queues[i].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(KernelBundleLinked); + cgh.single_task([=]() {}); + }); + queues[i].wait(); + } + return 0; +} diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp new file mode 100644 index 0000000000000..027058064c6b6 --- /dev/null +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp @@ -0,0 +1,159 @@ +// Test checks urProgramCreateWithBinaryExp functionality which is supported +// only Level Zero backend at the moment. +// REQUIRES: level_zero && gpu + +// Test to check several use cases for multi-device kernel bundles. +// Test covers AOT and JIT cases. Kernel is using some math functions to enforce +// using device libraries to excersise additional logic in the program manager. +// Checks are used to test that program and device libraries caching works as +// expected. + +// Test JIT first. +// Intentionally use jit linking of device libraries to check that program +// manager can handle this as well. With this option program manager will +// compile the main program, load and compile device libraries and then link +// everything together. +// RUN: %{build} -fsycl-device-lib-jit-link -o %t.out + +// Check the default case when in-memory caching is enabled. +// RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=4 SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s --check-prefixes=CHECK-SPIRV-JIT-LINK-TRACE + +// Check the case when in-memory caching of the programs is disabled. +// RUNL env SYCL_CACHE_IN_MEM=0 NEOReadDebugKeys=1 CreateMultipleRootDevices=4 +// %{run} %t.out + +// Test AOT next. +// RUN: %{build} -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" -o %t.out + +// Check the default case when in-memory caching is enabled. +// RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=4 SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s --check-prefixes=CHECK-AOT-TRACE + +// Check the case when in-memory caching of the programs is disabled. +// RUNL env SYCL_CACHE_IN_MEM=0 NEOReadDebugKeys=1 CreateMultipleRootDevices=4 +// %{run} %t.out + +#include +#include +#include +#include +#include + +class Kernel; +class Kernel2; +class Kernel3; + +int main() { + sycl::platform platform; + auto devices = platform.get_devices(); + assert(devices.size() >= 4); + auto dev1 = devices[0], dev2 = devices[1], dev3 = devices[2], + dev4 = devices[3]; + auto ctx = sycl::context({dev1, dev2, dev3, dev4}); + sycl::queue queues[4] = {sycl::queue(ctx, dev1), sycl::queue(ctx, dev2), + sycl::queue(ctx, dev3), sycl::queue(ctx, dev4)}; + + auto res = sycl::malloc_host(3, ctx); + auto KernelLambda = [=]() { + res[0] = sycl::ext::intel::math::float2int_rd(4.0) + (int)sqrtf(4.0f) + + std::exp(std::complex(0.f, 0.f)).real(); + }; + // Test case 1 + // Get bundle in executable state for multiple devices in a context, enqueue a + // kernel to each device. + { + sycl::kernel_id kid = sycl::get_kernel_id(); + // Create the main program containing the kernel. + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCreateWithIL + + // Create and compile the program for required device libraries (2 of them + // in this case). + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCreateWithIL + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCompileExp + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCreateWithIL + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCompileExp + + // Compile the main program + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCompileExp + + // Link main program and device libraries. + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramLinkExp + + // CHECK-AOT-TRACE: urProgramCreateWithBinaryExp + // CHECK-AOT-TRACE: urProgramBuildExp + sycl::kernel_bundle kernelBundleExecutable = + sycl::get_kernel_bundle( + ctx, {dev1, dev2, dev3}, {kid}); + + for (int i = 0; i < 3; i++) { + queues[i].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(kernelBundleExecutable); + cgh.single_task(KernelLambda); + }); + queues[i].wait(); + } + std::cout << "Test #1 passed." << std::endl; + } + + // Test case 2 + // Get two bundles in executable state: for the first two devices in the + // context and for the new set of devices which includes the dev4. This checks + // caching of the programs and device libraries. + { + sycl::kernel_id kid = sycl::get_kernel_id(); + // Program associated with {dev1, dev2, dev3} is supposed to be cached from + // the first test case, we don't expect any additional program creation and + // compilation calls for the following bundles because they are all created + // for subsets of {dev1, dev2, dev3} which means that the program handle + // from cache will be used. + sycl::kernel_bundle kernelBundleExecutableSubset1 = + sycl::get_kernel_bundle( + ctx, {dev1, dev2}, {kid}); + sycl::kernel_bundle kernelBundleExecutableSubset2 = + sycl::get_kernel_bundle( + ctx, {dev2, dev3}, {kid}); + sycl::kernel_bundle kernelBundleExecutableSubset3 = + sycl::get_kernel_bundle( + ctx, {dev1, dev3}, {kid}); + sycl::kernel_bundle kernelBundleExecutableSubset4 = + sycl::get_kernel_bundle(ctx, {dev3}, + {kid}); + + // Here we create a bundle with a different set of devices which includes + // dev4, so we expect new UR program creation. + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCreateWithIL + + // Device libraries will be additionally compiled for dev4, but no program + // creation is expected for device libraries as program handle already + // exists in the per-context cache. + // CHECK-SPIRV-JIT-LINK-TRACE-NOT: urProgramCreateWithIL + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCompileExp + + // Main program will be compiled for new set of devices. + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramCompileExp + + // Main program will be linked with device libraries. + // CHECK-SPIRV-JIT-LINK-TRACE: urProgramLinkExp + + // CHECK-AOT-TRACE: urProgramCreateWithBinaryExp + // CHECK-AOT-TRACE: urProgramBuildExp + sycl::kernel_bundle kernelBundleExecutableNewSet = + sycl::get_kernel_bundle( + ctx, {dev2, dev3, dev4}, {kid}); + + for (int i = 0; i < 3; i++) { + queues[0].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(kernelBundleExecutableSubset1); + cgh.single_task(KernelLambda); + }); + queues[0].wait(); + + queues[2].submit([=](sycl::handler &cgh) { + cgh.use_kernel_bundle(kernelBundleExecutableNewSet); + cgh.single_task(KernelLambda); + }); + queues[2].wait(); + } + std::cout << "Test #2 passed." << std::endl; + } + return 0; +} diff --git a/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp b/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp index 897ff6aba4f4d..c2e24d32fb081 100644 --- a/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp +++ b/sycl/unittests/SYCL2020/KernelBundleStateFiltering.cpp @@ -104,8 +104,10 @@ ur_result_t redefinedUrProgramCreate(void *pParams) { } ur_result_t redefinedUrProgramCreateWithBinary(void *pParams) { - auto params = *static_cast(pParams); - redefinedUrProgramCreateCommon(*params.ppBinary); + auto params = + *static_cast(pParams); + for (uint32_t i = 0; i < *params.pnumDevices; ++i) + redefinedUrProgramCreateCommon(*params.pppBinaries[i]); return UR_RESULT_SUCCESS; } @@ -153,9 +155,8 @@ TEST(KernelBundle, DeviceImageStateFiltering) { sycl::unittest::UrMock<> Mock; mock::getCallbacks().set_after_callback("urProgramCreateWithIL", &redefinedUrProgramCreate); - mock::getCallbacks().set_after_callback("urProgramCreateWithBinary", + mock::getCallbacks().set_after_callback("urProgramCreateWithBinaryExp", &redefinedUrProgramCreateWithBinary); - // No kernel ids specified. { const sycl::device Dev = sycl::platform().get_devices()[0]; diff --git a/sycl/unittests/helpers/RuntimeLinkingCommon.hpp b/sycl/unittests/helpers/RuntimeLinkingCommon.hpp index 87f0f980119f0..e6802d4c5716e 100644 --- a/sycl/unittests/helpers/RuntimeLinkingCommon.hpp +++ b/sycl/unittests/helpers/RuntimeLinkingCommon.hpp @@ -39,8 +39,9 @@ static ur_result_t redefined_urProgramCreateWithIL(void *pParams) { } static ur_result_t redefined_urProgramCreateWithBinary(void *pParams) { - auto Params = *static_cast(pParams); - auto *Magic = reinterpret_cast(*Params.ppBinary); + auto Params = + *static_cast(pParams); + auto *Magic = reinterpret_cast(*Params.pppBinaries[0]); ur_program_handle_t *res = *Params.pphProgram; *res = mock::createDummyHandle(sizeof(unsigned)); reinterpret_cast(*res)->setDataAs(*Magic); @@ -81,7 +82,7 @@ static void setupRuntimeLinkingMock() { mock::getCallbacks().set_replace_callback("urProgramCreateWithIL", redefined_urProgramCreateWithIL); mock::getCallbacks().set_replace_callback( - "urProgramCreateWithBinary", redefined_urProgramCreateWithBinary); + "urProgramCreateWithBinaryExp", redefined_urProgramCreateWithBinary); mock::getCallbacks().set_replace_callback("urProgramLinkExp", redefined_urProgramLinkExp); mock::getCallbacks().set_replace_callback("urKernelCreate", diff --git a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp index 5bcbc3150abef..70c2672ef0c01 100644 --- a/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp +++ b/sycl/unittests/kernel-and-program/MultipleDevsCache.cpp @@ -17,7 +17,9 @@ #include +#include #include +#include using namespace sycl; @@ -25,6 +27,8 @@ class MultipleDevsCacheTestKernel; MOCK_INTEGRATION_HEADER(MultipleDevsCacheTestKernel) +static constexpr uint32_t NumDevices = 3; + static sycl::unittest::UrImage Img = sycl::unittest::generateDefaultImage({"MultipleDevsCacheTestKernel"}); static sycl::unittest::UrImageArray<1> ImgArray{&Img}; @@ -32,13 +36,13 @@ static sycl::unittest::UrImageArray<1> ImgArray{&Img}; static ur_result_t redefinedDeviceGetAfter(void *pParams) { auto params = *static_cast(pParams); if (*params.ppNumDevices) { - **params.ppNumDevices = static_cast(2); + **params.ppNumDevices = static_cast(NumDevices); return UR_RESULT_SUCCESS; } - if (*params.pNumEntries == 2 && *params.pphDevices) { - (*params.pphDevices)[0] = reinterpret_cast(1111); - (*params.pphDevices)[1] = reinterpret_cast(2222); + if (*params.pNumEntries == NumDevices && *params.pphDevices) { + for (std::uintptr_t i = 0; i < NumDevices; ++i) + (*params.pphDevices)[i] = reinterpret_cast(i + 1); } return UR_RESULT_SUCCESS; } @@ -103,38 +107,52 @@ class MultipleDeviceCacheTest : public ::testing::Test { platform Plt; }; -// Test that program is retained for each device and each kernel is released -// once +// Test that program is retained for each subset of the list of devices and that +// number of urKernelRelease calls is correct. TEST_F(MultipleDeviceCacheTest, ProgramRetain) { { std::vector Devices = Plt.get_devices(info::device_type::gpu); sycl::context Context(Devices); sycl::queue Queue(Context, Devices[0]); - assert(Devices.size() == 2 && Context.get_devices().size() == 2); + assert(Devices.size() == NumDevices && + Context.get_devices().size() == NumDevices); auto KernelID = sycl::get_kernel_id(); auto Bundle = sycl::get_kernel_bundle( Queue.get_context(), {KernelID}); - assert(Bundle.get_devices().size() == 2); - + assert(Bundle.get_devices().size() == NumDevices); + + // Internally we create a kernel_bundle for the device associated with the + // queue and obtain the kernel object from it. So, as a result UR program + // for a single device is created (handle owned by device_image) and + // retained (because it is cached), as well as UR kernel is created using + // that UR program and retained because copy of the handle is returned to + // the caller. Queue.submit([&](sycl::handler &cgh) { cgh.single_task([]() {}); }); + // Here we create a kernel_bundle for each device in the context and obtain + // the kernel object from it. We can't reuse the UR program that was created + // earlier as it is associated with just one device. So we create new UR + // program associated with all devices in the context and retain it (put it + // into the cache). We also create a new UR kernel from the UR program and + // retain it becaise copy of the handle returned to the caller. auto BundleObject = sycl::build(Bundle, Bundle.get_devices()); auto Kernel = BundleObject.get_kernel(KernelID); - // Because of emulating 2 devices program is retained for each one in - // build(). It is also depends on number of device images. This test has one - // image, but other tests can create other images. Additional variable is - // added to control count of urProgramRetain calls + // Because of emulating multiple devices program is retained for each + // non-empty subset of provided list of devices in build(). It also depends + // on number of device images. This test has one image, but other tests can + // create other images. Additional variable is added to control count of + // urProgramRetain calls. auto BundleImpl = getSyclObjImpl(Bundle); // Bundle should only contain a single image, specifically the one with // MultipleDevsCacheTestKernel. EXPECT_EQ(BundleImpl->size(), size_t{1}); - int NumRetains = 1 + BundleImpl->size() * 2; + int NumRetains = BundleImpl->size() * std::pow(2, NumDevices) - 1; EXPECT_EQ(RetainCounter, NumRetains) << "Expect " << NumRetains << " piProgramRetain calls"; @@ -142,8 +160,8 @@ TEST_F(MultipleDeviceCacheTest, ProgramRetain) { detail::KernelProgramCache::KernelCacheT &KernelCache = CtxImpl->getKernelProgramCache().acquireKernelsPerProgramCache().get(); - EXPECT_EQ(KernelCache.size(), (size_t)1) - << "Expect 1 program in kernel cache"; + EXPECT_EQ(KernelCache.size(), (size_t)2) + << "Expect 2 programs in kernel cache"; for (auto &KernelProgIt : KernelCache) EXPECT_EQ(KernelProgIt.second.size(), (size_t)1) << "Expect 1 kernel cache"; @@ -154,5 +172,9 @@ TEST_F(MultipleDeviceCacheTest, ProgramRetain) { // kernel is removed from cache if urKernelRelease was called for it, so it // will not be removed twice for the other programs. As a result we must // expect 3 urKernelRelease calls. - EXPECT_EQ(KernelReleaseCounter, 3) << "Expect 3 piKernelRelease calls"; + + // We create 2 kernels in the test. So, we expect + // 4 urKernelRelease calls (correpsonding to 2 create calls + 2 retain calls + // when handle is returned to the caller). + EXPECT_EQ(KernelReleaseCounter, 4) << "Expect 4 piKernelRelease calls"; } diff --git a/sycl/unittests/program_manager/CompileTarget.cpp b/sycl/unittests/program_manager/CompileTarget.cpp index 281530a8e43e4..aac0aaf5f5315 100644 --- a/sycl/unittests/program_manager/CompileTarget.cpp +++ b/sycl/unittests/program_manager/CompileTarget.cpp @@ -137,9 +137,11 @@ static ur_result_t redefinedDeviceGet(void *pParams) { std::vector createWithBinaryLog; static ur_result_t redefinedProgramCreateWithBinary(void *pParams) { - auto params = *static_cast(pParams); - createWithBinaryLog.push_back( - reinterpret_cast(*params.ppBinary)); + auto params = + *static_cast(pParams); + for (uint32_t i = 0; i < *params.pnumDevices; ++i) + createWithBinaryLog.push_back( + reinterpret_cast(*params.pppBinaries[i])); return UR_RESULT_SUCCESS; } @@ -204,7 +206,7 @@ class CompileTargetTest : public testing::Test { protected: sycl::unittest::UrMock<> Mock; CompileTargetTest() { - mock::getCallbacks().set_before_callback("urProgramCreateWithBinary", + mock::getCallbacks().set_before_callback("urProgramCreateWithBinaryExp", &redefinedProgramCreateWithBinary); mock::getCallbacks().set_before_callback("urProgramCreateWithIL", &redefinedProgramCreateWithIL);