From 197ceec0b412b294b66a6fc7c68fdd580de7d03d Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 23 Apr 2024 11:04:40 +0100 Subject: [PATCH 01/21] CUDA adapter multi device context Implement multi device context in CUDA adapter. This allows multiple devices to share the same context. When devices in the same context use the same ur_mem_handle_t, the SYCL runtime expects the plugin library to be able to migrate memory between allocations on different devices. This necessitates some added functionality for the ur_mem_handle_t class. Some more detailed documentation is added in memory.hpp explaining the memory migration patterns. --- source/adapters/cuda/command_buffer.cpp | 57 ++-- source/adapters/cuda/context.cpp | 23 +- source/adapters/cuda/context.hpp | 77 +++-- source/adapters/cuda/device.cpp | 4 +- source/adapters/cuda/device.hpp | 11 +- source/adapters/cuda/enqueue.cpp | 331 +++++++++++------- source/adapters/cuda/event.cpp | 3 +- source/adapters/cuda/image.cpp | 86 +++-- source/adapters/cuda/kernel.cpp | 15 +- source/adapters/cuda/kernel.hpp | 25 +- source/adapters/cuda/memory.cpp | 437 ++++++++++++------------ source/adapters/cuda/memory.hpp | 424 +++++++++++++++++++---- source/adapters/cuda/physical_mem.cpp | 6 +- source/adapters/cuda/physical_mem.hpp | 34 +- source/adapters/cuda/platform.cpp | 26 +- source/adapters/cuda/program.cpp | 48 ++- source/adapters/cuda/program.hpp | 15 +- source/adapters/cuda/queue.cpp | 15 +- source/adapters/cuda/queue.hpp | 1 + source/adapters/cuda/usm.cpp | 63 ++-- source/adapters/cuda/usm_p2p.cpp | 10 +- source/adapters/cuda/virtual_mem.cpp | 56 +-- test/adapters/cuda/context_tests.cpp | 10 +- 23 files changed, 1118 insertions(+), 659 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 8f1ede3010..8d21a93c75 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -203,9 +203,10 @@ static ur_result_t enqueueCommandBufferFillHelper( } } - UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), - DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); + UR_CHECK_ERROR( + cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, + DepsList.data(), DepsList.size(), &NodeParams, + CommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *SyncPoint = @@ -237,7 +238,7 @@ static ur_result_t enqueueCommandBufferFillHelper( UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStepFirst, - CommandBuffer->Device->getContext())); + CommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *SyncPoint = CommandBuffer->addSyncPoint( @@ -269,7 +270,7 @@ static ur_result_t enqueueCommandBufferFillHelper( UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStep, - CommandBuffer->Device->getContext())); + CommandBuffer->Device->getNativeContext())); GraphNodePtr = std::make_shared(GraphNode); // Get sync point and register the cuNode with it. 
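The getPtrWithOffset and getNativeContext calls above are the pattern this patch applies throughout: call sites stop reading a single context-wide pointer (the old `.get() + offset`) and instead ask the buffer for the allocation belonging to a specific device. A minimal sketch of the idea behind the new BufferMem interface follows; the member name Ptrs is an assumption, and lazy allocation and migration bookkeeping are omitted, so this is not the actual memory.hpp definition:

#include <vector>
#include <cuda.h>

// Sketch only: one native allocation per device in the owning UR context.
struct BufferMemSketch {
  // Assumed member; indexed by the device's index within its platform
  // (see the getIndex() helper added to ur_device_handle_t_ below).
  std::vector<CUdeviceptr> Ptrs;

  // Callers now pass the device they are enqueueing on, instead of
  // reading one context-wide pointer.
  CUdeviceptr getPtrWithOffset(ur_device_handle_t Device, size_t Offset) {
    return Ptrs[Device->getIndex()] + Offset;
  }

  CUdeviceptr getPtr(ur_device_handle_t Device) {
    return getPtrWithOffset(Device, 0);
  }
};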
@@ -478,7 +479,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -513,8 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( } try { - auto Src = std::get(hSrcMem->Mem).get() + srcOffset; - auto Dst = std::get(hDstMem->Mem).get() + dstOffset; + auto Src = std::get(hSrcMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, srcOffset); + auto Dst = std::get(hDstMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, dstOffset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size, @@ -522,7 +525,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -553,8 +556,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( } try { - CUdeviceptr SrcPtr = std::get(hSrcMem->Mem).get(); - CUdeviceptr DstPtr = std::get(hDstMem->Mem).get(); + auto SrcPtr = + std::get(hSrcMem->Mem).getPtr(hCommandBuffer->Device); + auto DstPtr = + std::get(hDstMem->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, @@ -563,7 +568,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -593,7 +598,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( } try { - auto Dst = std::get(hBuffer->Mem).get() + offset; + auto Dst = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, @@ -601,7 +607,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -630,7 +636,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( } try { - auto Src = std::get(hBuffer->Mem).get() + offset; + auto Src = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, @@ -638,7 +645,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
*pSyncPoint = @@ -670,7 +677,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( } try { - CUdeviceptr DstPtr = std::get(hBuffer->Mem).get(); + auto DstPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, @@ -680,7 +688,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -712,7 +720,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( } try { - CUdeviceptr SrcPtr = std::get(hBuffer->Mem).get(); + auto SrcPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, @@ -722,7 +731,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -821,7 +830,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); - auto DstDevice = std::get(hBuffer->Mem).get() + offset; + auto DstDevice = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); return enqueueCommandBufferFillHelper( hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize, @@ -854,7 +864,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( try { std::unique_ptr RetImplEvent{nullptr}; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -972,7 +982,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (ArgValue == nullptr) { Kernel->setKernelArg(ArgIndex, 0, nullptr); } else { - CUdeviceptr CuPtr = std::get(ArgValue->Mem).get(); + CUdeviceptr CuPtr = + std::get(ArgValue->Mem).getPtr(CommandBuffer->Device); Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp index 40ae0ce4ad..7499534585 100644 --- a/source/adapters/cuda/context.cpp +++ b/source/adapters/cuda/context.cpp @@ -46,23 +46,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) { - std::ignore = DeviceCount; std::ignore = pProperties; - assert(DeviceCount == 1); - ur_result_t RetErr = UR_RESULT_SUCCESS; - std::unique_ptr ContextPtr{nullptr}; try { ContextPtr = std::unique_ptr( - new ur_context_handle_t_{*phDevices}); + new ur_context_handle_t_{phDevices, DeviceCount}); *phContext = ContextPtr.release(); } catch (ur_result_t Err) { - RetErr = Err; + return Err; } catch (...) 
{
-    RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+    return UR_RESULT_ERROR_OUT_OF_RESOURCES;
   }
-  return RetErr;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
@@ -72,9 +68,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
 
   switch (static_cast<ur_context_info_t>(ContextInfoType)) {
   case UR_CONTEXT_INFO_NUM_DEVICES:
-    return ReturnValue(1);
+    return ReturnValue(static_cast<uint32_t>(hContext->getDevices().size()));
   case UR_CONTEXT_INFO_DEVICES:
-    return ReturnValue(hContext->getDevice());
+    return ReturnValue(hContext->getDevices());
   case UR_CONTEXT_INFO_REFERENCE_COUNT:
     return ReturnValue(hContext->getReferenceCount());
   case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
@@ -88,7 +84,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
     int Major = 0;
     UR_CHECK_ERROR(cuDeviceGetAttribute(
         &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-        hContext->getDevice()->get()));
+        hContext->getDevices()[0]->get()));
     uint32_t Capabilities =
         (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
                            UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
@@ -137,7 +133,10 @@ urContextRetain(ur_context_handle_t hContext) {
 
 UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
     ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
-  *phNativeContext = reinterpret_cast<ur_native_handle_t>(hContext->get());
+  // FIXME: this entry point has been deprecated in the SYCL RT and should be
+  // changed to unsupported once the deprecation period has elapsed.
+  *phNativeContext = reinterpret_cast<ur_native_handle_t>(
+      hContext->getDevices()[0]->getNativeContext());
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp
index 4ddcaf3a87..f28e58afe7 100644
--- a/source/adapters/cuda/context.hpp
+++ b/source/adapters/cuda/context.hpp
@@ -33,27 +33,26 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
 ///
 /// One of the main differences between the UR API and the CUDA driver API is
 /// that the second modifies the state of the threads by assigning
-/// `CUcontext` objects to threads. `CUcontext` objects store data associated
+/// \c CUcontext objects to threads. \c CUcontext objects store data associated
 /// with a given device and control access to said device from the user side.
 /// UR API context are objects that are passed to functions, and not bound
 /// to threads.
-/// The ur_context_handle_t_ object doesn't implement this behavior. It only
-/// holds the CUDA context data. The RAII object \ref ScopedContext implements
-/// the active context behavior.
 ///
-/// Primary vs User-defined context
+/// Since the \c ur_context_handle_t can contain multiple devices, and a \c
+/// CUcontext refers to only a single device, the \c CUcontext is more tightly
+/// coupled to a \c ur_device_handle_t than a \c ur_context_handle_t. In order
+/// to remove some ambiguities about the different semantics of
+/// \c ur_context_handle_t and native \c CUcontext, we access the native \c
+/// CUcontext solely through the \c ur_device_handle_t class, by using the
+/// object \ref ScopedContext, which sets the active device (by setting the
+/// active native \c CUcontext).
 ///
-/// CUDA has two different types of context, the Primary context,
-/// which is usable by all threads on a given process for a given device, and
-/// the aforementioned custom contexts.
-/// The CUDA documentation, confirmed with performance analysis, suggest using
-/// the Primary context whenever possible.
-/// The Primary context is also used by the CUDA Runtime API.
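To make the new ScopedContext contract concrete, usage now follows the sketch below. The function name is hypothetical; ScopedContext(ur_device_handle_t) and getNativeContext() are the constructs this hunk introduces:

// Hypothetical helper showing the new activation pattern. ScopedContext is
// constructed from a device rather than a UR context, because a UR context
// may now span several devices, each with its own native CUcontext.
ur_result_t exampleEntryPoint(ur_queue_handle_t hQueue) {
  ScopedContext Active(hQueue->getDevice()); // binds the device's CUcontext
  // ... CUDA driver API calls here target hQueue's device ...
  return UR_RESULT_SUCCESS;
}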
-/// For UR applications to interop with CUDA Runtime API, they have to use
-/// the primary context - and make that active in the thread.
-/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
-/// that allows to construct a Primary or `user-defined` context, so that
-/// the UR object interface is always the same.
+/// Primary vs User-defined \c CUcontext
+///
+/// CUDA has two different types of \c CUcontext: the Primary context, which is
+/// usable by all threads on a given process for a given device, and the
+/// aforementioned custom \c CUcontext objects. The CUDA documentation,
+/// confirmed by performance analysis, suggests using the Primary context whenever possible.
 ///
 /// Destructor callback
 ///
@@ -63,6 +62,18 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
 /// See proposal for details.
 /// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
 ///
+///
+/// Memory Management for Devices in a Context
+///
+/// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which
+/// may refer to multiple devices. Therefore the \c ur_mem_handle_t must
+/// handle a native allocation for each device in the context. UR is
+/// responsible for automatically handling event dependencies for kernels
+/// writing to or reading from the same \c ur_mem_handle_t and migrating memory
+/// between native allocations for devices in the same \c ur_context_handle_t_
+/// if necessary.
+///
+///
 struct ur_context_handle_t_ {
 
   struct deleter_data {
@@ -72,18 +83,21 @@ struct ur_context_handle_t_ {
     void operator()() { Function(UserData); }
   };
 
-  using native_type = CUcontext;
-
-  native_type CUContext;
-  ur_device_handle_t DeviceID;
+  std::vector<ur_device_handle_t> Devices;
   std::atomic_uint32_t RefCount;
 
-  ur_context_handle_t_(ur_device_handle_t_ *DevID)
-      : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} {
-    urDeviceRetain(DeviceID);
+  ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
+      : Devices{Devs, Devs + NumDevices}, RefCount{1} {
+    for (auto &Dev : Devices) {
+      urDeviceRetain(Dev);
+    }
   };
 
-  ~ur_context_handle_t_() { urDeviceRelease(DeviceID); }
+  ~ur_context_handle_t_() {
+    for (auto &Dev : Devices) {
+      urDeviceRelease(Dev);
+    }
+  }
 
   void invokeExtendedDeleters() {
     std::lock_guard<std::mutex> Guard(Mutex);
@@ -98,9 +112,9 @@ struct ur_context_handle_t_ {
     ExtendedDeleters.emplace_back(deleter_data{Function, UserData});
   }
 
-  ur_device_handle_t getDevice() const noexcept { return DeviceID; }
-
-  native_type get() const noexcept { return CUContext; }
+  const std::vector<ur_device_handle_t> &getDevices() const noexcept {
+    return Devices;
+  }
 
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
 
@@ -123,12 +137,11 @@ struct ur_context_handle_t_ {
 namespace {
 class ScopedContext {
 public:
-  ScopedContext(ur_context_handle_t Context) {
-    if (!Context) {
-      throw UR_RESULT_ERROR_INVALID_CONTEXT;
+  ScopedContext(ur_device_handle_t Device) {
+    if (!Device) {
+      throw UR_RESULT_ERROR_INVALID_DEVICE;
     }
-
-    setContext(Context->get());
+    setContext(Device->getNativeContext());
   }
 
   ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
 
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index 949b58666e..3a94587d1f 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -47,7 +47,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
 
   static constexpr uint32_t MaxWorkItemDimensions = 3u;
 
-  ScopedContext 
Active(hDevice->getContext()); + ScopedContext Active(hDevice); switch ((uint32_t)propName) { case UR_DEVICE_INFO_TYPE: { @@ -1234,7 +1234,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, uint64_t *pDeviceTimestamp, uint64_t *pHostTimestamp) { CUevent Event; - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); if (pDeviceTimestamp) { UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index 373c8a8ab9..0a40329026 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -22,6 +22,7 @@ struct ur_device_handle_t_ { CUevent EvBase; // CUDA event used as base counter std::atomic_uint32_t RefCount; ur_platform_handle_t Platform; + uint32_t DeviceIndex; static constexpr uint32_t MaxWorkItemDimensions = 3u; size_t MaxWorkItemSizes[MaxWorkItemDimensions]; @@ -34,9 +35,9 @@ struct ur_device_handle_t_ { public: ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, - ur_platform_handle_t platform) + ur_platform_handle_t platform, uint32_t DevIndex) : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, - Platform(platform) { + Platform(platform), DeviceIndex{DevIndex} { UR_CHECK_ERROR(cuDeviceGetAttribute( &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, @@ -79,12 +80,16 @@ struct ur_device_handle_t_ { native_type get() const noexcept { return CuDevice; }; - CUcontext getContext() const noexcept { return CuContext; }; + CUcontext getNativeContext() const noexcept { return CuContext; }; uint32_t getReferenceCount() const noexcept { return RefCount; } ur_platform_handle_t getPlatform() const noexcept { return Platform; }; + // Returns the index of the device relative to the other devices in the same + // platform + uint32_t getIndex() const noexcept { return DeviceIndex; } + uint64_t getElapsedTime(CUevent) const; size_t getMaxWorkItemSizes(int index) const noexcept { diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 812a41768e..55d8cfd3cc 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -26,7 +26,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); try { - ScopedContext Active(CommandQueue->getContext()); + ScopedContext Active(CommandQueue->getDevice()); auto Result = forLatestEvents( EventWaitList, NumEventsInWaitList, @@ -188,7 +188,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, // @param [out] ThreadsPerBlock Number of threads per block we should run // @param [out] BlocksPerGrid Number of blocks per grid we should run ur_result_t -setKernelParams(const ur_context_handle_t Context, +setKernelParams([[maybe_unused]] const ur_context_handle_t Context, const ur_device_handle_t Device, const uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, @@ -201,7 +201,7 @@ setKernelParams(const ur_context_handle_t Context, try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(Context); + ScopedContext Active(Device); { size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; MaxWorkGroupSize = Device->getMaxWorkGroupSize(); @@ -271,7 +271,6 @@ setKernelParams(const ur_context_handle_t Context, CudaImplicitOffset); } - auto Device = Context->getDevice(); if (LocalSize > 
static_cast(Device->getMaxCapacityLocalMem())) { setErrorMessage("Excessive allocation of local memory on the device", UR_RESULT_ERROR_ADAPTER_SPECIFIC); @@ -329,7 +328,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // This function makes one stream work on the previous work (or work // represented by input events) and then all future work waits on that stream. try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -410,11 +409,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // Preconditions - UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + std::vector DepEvents( + phEventWaitList, phEventWaitList + numEventsInWaitList); + std::vector> MemMigrationLocks; + + // phEventWaitList only contains events that are handed to UR by the SYCL + // runtime. However since UR handles memory dependencies within a context + // we may need to add more events to our dependent events list if the UR + // context contains multiple devices + if (hQueue->getContext()->Devices.size() > 1) { + MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); + for (auto &MemArg : hKernel->Args.MemObjArgs) { + bool PushBack = false; + if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; + MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), + MemDepEvent) == DepEvents.end()) { + DepEvents.push_back(MemDepEvent); + PushBack = true; + } + if ((MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || + PushBack) { + if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), + [MemArg](auto &Lock) { + return Lock.first == MemArg.Mem; + }) == MemMigrationLocks.end()) + MemMigrationLocks.emplace_back( + std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); + } + } + } + + // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); @@ -426,26 +457,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t BlocksPerGrid[3] = {1u, 1u, 1u}; uint32_t LocalSize = hKernel->getLocalSize(); - ur_result_t Result = UR_RESULT_SUCCESS; CUfunction CuFunc = hKernel->get(); - Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(setKernelParams(hQueue->getContext(), hQueue->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, hKernel, CuFunc, + ThreadsPerBlock, BlocksPerGrid)); try { std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + if (DepEvents.size()) { + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + 
phEventWaitList)); + } + + // For memory migration across devices in the same context + if (hQueue->getContext()->Devices.size() > 1) { + for (auto &MemArg : hKernel->Args.MemObjArgs) { + migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + } + } if (phEvent) { RetImplEvent = @@ -454,6 +492,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } + // Once event has been started we can unlock MemoryMigrationMutex + if (hQueue->getContext()->Devices.size() > 1) { + for (auto &MemArg : hKernel->Args.MemObjArgs) { + // Telling the ur_mem_handle_t that it will need to wait on this kernel + // if it has been written to + if (phEvent && (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { + MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get(), + hQueue->getDevice()); + } + } + // We can release the MemoryMigrationMutexes now + MemMigrationLocks.clear(); + } + auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], @@ -469,9 +522,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( @@ -565,35 +618,51 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); + ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + CUstream Stream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + try { + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the MemBuffer, meaning we must perform the copy + // from a different device + if (hBuffer->LastEventWritingToMemObj && + hBuffer->LastDeviceWritingToMemObj != hQueue->getDevice()) { + Device = hBuffer->LastDeviceWritingToMemObj; + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another queue if it is the last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hBuffer->LastEventWritingToMemObj)); + } + + ScopedContext Active(Device); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( - CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + void *DevPtr = std::get(hBuffer->Mem).getVoid(Device); + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( + Stream, region, DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, - hostRowPitch, hostSlicePitch); + hostRowPitch, hostSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); } if 
(blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } if (phEvent) { @@ -601,9 +670,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -613,15 +682,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); + CUdeviceptr DevPtr = + std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream cuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -630,10 +699,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, - bufferRowPitch, bufferSlicePitch); + bufferRowPitch, bufferSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -648,9 +717,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -666,7 +735,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); ur_result_t Result = UR_RESULT_SUCCESS; auto Stream = hQueue->getNextTransferStream(); @@ -680,8 +749,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - auto Src = std::get(hBufferSrc->Mem).get() + srcOffset; - auto Dst = std::get(hBufferDst->Mem).get() + dstOffset; + auto Src = std::get(hBufferSrc->Mem) + .getPtrWithOffset(hQueue->getDevice(), srcOffset); + auto Dst = std::get(hBufferDst->Mem) + .getPtrWithOffset(hQueue->getDevice(), dstOffset); UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); @@ -706,12 +777,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr SrcPtr = std::get(hBufferSrc->Mem).get(); - CUdeviceptr DstPtr = std::get(hBufferDst->Mem).get(); + CUdeviceptr SrcPtr = + std::get(hBufferSrc->Mem).getPtr(hQueue->getDevice()); + CUdeviceptr DstPtr = + std::get(hBufferDst->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -784,7 +857,7 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urEnqueueMemBufferFill( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); ur_result_t Result = @@ -797,7 +870,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->start()); } - auto DstDevice = std::get(hBuffer->Mem).get() + offset; + auto DstDevice = std::get(hBuffer->Mem) + .getPtrWithOffset(hQueue->getDevice(), offset); auto N = size / patternSize; // pattern size in bytes @@ -933,18 +1007,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t Result = UR_RESULT_SUCCESS; + ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + CUstream Stream = hQueue->getNextTransferStream(); try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the Image, meaning we must perform the copy + // from a different device + if (hImage->LastEventWritingToMemObj && + hImage->LastDeviceWritingToMemObj != hQueue->getDevice()) { + Device = hImage->LastDeviceWritingToMemObj; + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another queue if it is the last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hImage->LastEventWritingToMemObj)); + } + + ScopedContext Active(Device); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); - CUarray Array = std::get(hImage->Mem).getArray(); + CUarray Array = std::get(hImage->Mem).getArray(Device); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -954,29 +1042,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); + ur_mem_type_t ImgType = std::get(hImage->Mem).getType(); std::unique_ptr RetImplEvent{nullptr}; if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream)); + UR_COMMAND_MEM_IMAGE_READ, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( - cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, Stream)); } else { ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, - SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + UR_CHECK_ERROR(commonEnqueueMemImageNDCopy( + Stream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{})); } if 
(phEvent) { @@ -985,7 +1070,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } } catch (ur_result_t Err) { return Err; @@ -993,7 +1078,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( return UR_RESULT_ERROR_UNKNOWN; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( @@ -1005,18 +1090,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray Array = std::get(hImage->Mem).getArray(); + CUarray Array = + std::get(hImage->Mem).getArray(hQueue->getDevice()); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -1034,7 +1119,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); + ur_mem_type_t ImgType = std::get(hImage->Mem).getType(); if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); @@ -1071,24 +1156,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(std::get(hImageSrc->Mem).getImageType() == - std::get(hImageDst->Mem).getImageType(), + UR_ASSERT(hImageSrc->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(std::get(hImageSrc->Mem).getType() == + std::get(hImageDst->Mem).getType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray SrcArray = std::get(hImageSrc->Mem).getArray(); - CUarray DstArray = std::get(hImageDst->Mem).getArray(); + CUarray SrcArray = + std::get(hImageSrc->Mem).getArray(hQueue->getDevice()); + CUarray DstArray = + std::get(hImageDst->Mem).getArray(hQueue->getDevice()); CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); @@ -1117,7 +1202,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getImageType(); + ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getType(); ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; @@ -1154,8 +1239,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_map_flags_t mapFlags, size_t offset, size_t 
size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) { - UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(offset + size <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); @@ -1177,7 +1261,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( MapPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (IsPinned) { Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, @@ -1208,8 +1292,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); auto &BufferImpl = std::get(hMem->Mem); auto *Map = BufferImpl.getMapDetails(pMappedPtr); @@ -1225,7 +1308,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (IsPinned) { Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, @@ -1256,7 +1339,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -1310,7 +1393,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -1347,7 +1430,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_CHECK_ERROR(cuPointerGetAttribute( &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); - ur_device_handle_t Device = hQueue->getContext()->getDevice(); + ur_device_handle_t Device = hQueue->getDevice(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. 
cuMemPrefetchAsync requires concurrent memory access @@ -1373,7 +1456,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -1414,7 +1497,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { - ur_device_handle_t Device = hQueue->getContext()->getDevice(); + ur_device_handle_t Device = hQueue->getDevice(); if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -1441,7 +1524,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (phEvent) { EventPtr = @@ -1453,16 +1536,16 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_READ_MOSTLY, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); } else { Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, - hQueue->getContext()->getDevice()->get()); + hQueue->getDevice()->get()); } if (phEvent) { @@ -1493,7 +1576,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( ur_result_t result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->getContext()); + ScopedContext active(hQueue->getDevice()); CUstream cuStream = hQueue->getNextTransferStream(); result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); @@ -1543,33 +1626,49 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; + ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + CUstream Stream = hQueue->getNextTransferStream(); try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the MemBuffer, meaning we must perform the copy + // from a different device + if (hBuffer->LastEventWritingToMemObj && + hBuffer->LastDeviceWritingToMemObj != hQueue->getDevice()) { + Device = hBuffer->LastDeviceWritingToMemObj; + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another 
queue if it is the last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hBuffer->LastEventWritingToMemObj)); + } + + ScopedContext Active(Device); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + UR_COMMAND_MEM_BUFFER_READ, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } - UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); + UR_CHECK_ERROR(cuMemcpyDtoHAsync( + pDst, + std::get(hBuffer->Mem).getPtrWithOffset(Device, offset), + size, Stream)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } if (phEvent) { @@ -1577,10 +1676,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( @@ -1592,11 +1690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); + CUdeviceptr DevPtr = + std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, @@ -1717,7 +1816,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextComputeStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index 1e8f2dd384..f9889a3f46 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -221,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { try { - ScopedContext Active(phEventWaitList[0]->getContext()); + ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice()); auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t { UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); @@ -256,7 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { std::unique_ptr event_ptr{hEvent}; ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - ScopedContext Active(hEvent->getContext()); Result = hEvent->release(); } catch (...) 
{ Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index e84e5e0837..95dc2e258e 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -331,7 +331,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem, size_t *pResultPitch) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); std::ignore = pUSMDesc; std::ignore = pool; @@ -350,7 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( UR_RESULT_ERROR_INVALID_VALUE); ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, widthInBytes, height, elementSizeBytes)); } catch (ur_result_t error) { @@ -366,7 +368,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage)); @@ -377,7 +381,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage)); @@ -388,7 +394,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); // Populate descriptor @@ -435,7 +443,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); // Allocate a cuArray if (pImageDesc->numMipLevel == 1) { @@ -475,10 +483,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); try { UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem)); } catch (ur_result_t Err) { @@ -494,7 +504,9 @@ UR_APIEXPORT ur_result_t UR_APICALL 
urBindlessImagesUnsampledImageCreateExp( ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); unsigned int NumChannels = 0; @@ -509,7 +521,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_RESOURCE_DESC image_res_desc = {}; @@ -537,10 +549,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler, ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); unsigned int NumChannels = 0; UR_CHECK_ERROR( @@ -636,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( &PixelSizeBytes)); try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); @@ -968,11 +982,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUarray ImageArray; UR_CHECK_ERROR(cuMipmappedArrayGetLevel( &ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel)); @@ -989,10 +1005,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); try { UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem)); } catch (ur_result_t Err) { @@ -1007,11 +1025,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, ur_exp_interop_mem_desc_t *pInteropMemDesc, ur_exp_interop_mem_handle_t *phInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); 
CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; extMemDesc.size = size; @@ -1050,7 +1070,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_exp_interop_mem_handle_t hInteropMem, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); unsigned int NumChannels = 0; @@ -1062,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr)); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {}; ArrayDesc.Width = pImageDesc->width; @@ -1101,11 +1123,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_interop_mem_handle_t hInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem)); } catch (ur_result_t Err) { return Err; @@ -1120,11 +1144,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {}; @@ -1159,11 +1185,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_interop_semaphore_handle_t hInteropSemaphore) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR( cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore)); } catch (ur_result_t Err) { @@ -1180,7 +1208,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_event_handle_t *phEvent) { try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); @@ -1212,7 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_event_handle_t *phEvent) { try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); 
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index c9334add15..a3c5e607a9 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -19,7 +19,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, std::unique_ptr Kernel{nullptr}; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); CUfunction CuFunc; CUresult FunctionResult = @@ -293,8 +293,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_mem_obj_properties_t *Properties, ur_mem_handle_t hArgValue) { - std::ignore = Properties; - // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. if (hArgValue == nullptr) { @@ -304,10 +302,12 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_result_t Result = UR_RESULT_SUCCESS; try { - if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { + auto Device = hKernel->getProgram()->getDevice(); + hKernel->Args.addMemObjArg(argIndex, hArgValue, Properties->memoryAccess); + if (hArgValue->isImage()) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, std::get(hArgValue->Mem).getArray())); + &arrayDesc, std::get(hArgValue->Mem).getArray(Device))); if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_HALF && @@ -317,10 +317,11 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - CUsurfObject CuSurf = std::get(hArgValue->Mem).getSurface(); + CUsurfObject CuSurf = + std::get(hArgValue->Mem).getSurface(Device); hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); } else { - CUdeviceptr CuPtr = std::get(hArgValue->Mem).get(); + CUdeviceptr CuPtr = std::get(hArgValue->Mem).getPtr(Device); hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 9e65066ba7..b7a7358b27 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -63,6 +63,14 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; args_index_t Indices; args_size_t OffsetPerIndex; + // A struct to keep track of memargs so that we can do dependency analysis + // at urEnqueueKernelLaunch + struct mem_obj_arg { + ur_mem_handle_t_ *Mem; + int Index; + ur_mem_flags_t AccessFlags; + }; + std::vector MemObjArgs; std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; @@ -116,6 +124,20 @@ struct ur_kernel_handle_t_ { Size + (AlignedLocalOffset - LocalOffset)); } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { + assert(hMem && "Invalid mem handle"); + // To avoid redundancy we are not storing mem obj with index i at index + // i in the vec of MemObjArgs. 
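+    // For example, addMemObjArg(3, A, ...) followed by addMemObjArg(0, B, ...)
+    // stores {A,3} then {B,0}; a later addMemObjArg(3, C, ...) overwrites
+    // {A,3} in place rather than appending a duplicate entry for index 3.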
+ for (auto &Arg : MemObjArgs) { + if (Arg.Index == Index) { + // Overwrite the mem obj with the same index + Arg = arguments::mem_obj_arg{hMem, Index, Flags}; + return; + } + } + MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); + } + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -142,7 +164,7 @@ struct ur_kernel_handle_t_ { urContextRetain(Context); /// Note: this code assumes that there is only one device per context ur_result_t RetError = urKernelGetGroupInfo( - this, Context->getDevice(), + this, Program->getDevice(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); (void)RetError; @@ -165,6 +187,7 @@ struct ur_kernel_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } native_type get() const noexcept { return Function; }; + ur_program_handle_t getProgram() const noexcept { return Program; }; native_type get_with_offset_parameter() const noexcept { return FunctionWithOffsetParam; diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index f097d2474e..38f0ccf5d3 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -36,59 +36,47 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); - ur_result_t Result = UR_RESULT_SUCCESS; ur_mem_handle_t MemObj = nullptr; try { - ScopedContext Active(hContext); - CUdeviceptr Ptr = 0; auto HostPtr = pProperties ? pProperties->pHost : nullptr; - BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { - UR_CHECK_ERROR( - cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); AllocMode = BufferMem::AllocMode::AllocHostPtr; - } else { - UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); - if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = BufferMem::AllocMode::CopyIn; - } + } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = BufferMem::AllocMode::CopyIn; } - ur_mem_handle_t parentBuffer = nullptr; + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, AllocMode, HostPtr, size}); + if (URMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } - auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); - if (URMemObj != nullptr) { - MemObj = URMemObj.release(); - if (PerformInitialCopy) { - // Operates on the default stream of the current CUDA context. 
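+    // The initial copies below use the non-async cuMemcpyHtoD, which
+    // exhibits synchronous behavior with respect to the host, so an explicit
+    // cuStreamSynchronize is assumed to be unnecessary before other UR calls
+    // read the buffer.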
+ // First allocation will be made at urMemBufferCreate if context only + // has one device + if (PerformInitialCopy && HostPtr) { + // Perform initial copy to every device in context + for (auto &Device : hContext->getDevices()) { + ScopedContext Active(Device); + // getPtr may allocate mem if not already allocated + const auto &Ptr = std::get(URMemObj->Mem).getPtr(Device); UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other UR call - // uses it. - CUstream defaultStream = 0; - UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); } - } else { - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } + MemObj = URMemObj.release(); } catch (ur_result_t Err) { - Result = Err; + return Err; } catch (...) { - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } *phBuffer = MemObj; - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { @@ -117,26 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { return UR_RESULT_SUCCESS; } - ScopedContext Active(MemObjPtr->getContext()); - - if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { - auto &BufferImpl = std::get(MemObjPtr->Mem); - switch (BufferImpl.MemAllocMode) { - case BufferMem::AllocMode::CopyIn: - case BufferMem::AllocMode::Classic: - UR_CHECK_ERROR(cuMemFree(BufferImpl.Ptr)); - break; - case BufferMem::AllocMode::UseHostPtr: - UR_CHECK_ERROR(cuMemHostUnregister(BufferImpl.HostPtr)); - break; - case BufferMem::AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(BufferImpl.HostPtr)); - }; - } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { - auto &SurfaceImpl = std::get(MemObjPtr->Mem); - UR_CHECK_ERROR(cuSurfObjectDestroy(SurfaceImpl.getSurface())); - UR_CHECK_ERROR(cuArrayDestroy(SurfaceImpl.getArray())); - } + UR_CHECK_ERROR(hMem->clear()); } catch (ur_result_t Err) { Result = Err; @@ -161,10 +130,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { /// \param[out] phNativeMem Set to the native handle of the UR mem object. /// /// \return UR_RESULT_SUCCESS -UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( - ur_mem_handle_t hMem, ur_device_handle_t, ur_native_handle_t *phNativeMem) { - *phNativeMem = reinterpret_cast( - std::get(hMem->Mem).get()); +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device, + ur_native_handle_t *phNativeMem) { + try { + *phNativeMem = reinterpret_cast( + std::get(hMem->Mem).getPtr(Device)); + } catch (ur_result_t Err) { + return Err; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } return UR_RESULT_SUCCESS; } @@ -177,14 +153,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - ScopedContext Active(hMemory->getContext()); + // Any device in context will do + auto Device = hMemory->getContext()->getDevices()[0]; + ScopedContext Active(Device); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { size_t AllocSize = 0; UR_CHECK_ERROR(cuMemGetAddressRange( - nullptr, &AllocSize, std::get(hMemory->Mem).Ptr)); + nullptr, &AllocSize, + std::get(hMemory->Mem).getPtr(Device))); return ReturnValue(AllocSize); } catch (ur_result_t Err) { return Err; @@ -242,160 +221,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t Result = UR_RESULT_SUCCESS; - // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives - // a minimum value of 1, so we need to convert the answer. - CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; - ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required - ArrayDesc.Width = pImageDesc->width; - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = pImageDesc->depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t PixelTypeSizeBytes; - - switch (pImageFormat->channelType) { - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_HALF; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_FLOAT; - PixelTypeSizeBytes = 4; - break; - default: - detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used pImageDesc has the size set to 1 - size_t PixelSizeBytes = - PixelTypeSizeBytes * 4; // 4 is the only number of channels we support - size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; - - ScopedContext Active(hContext); - CUarray ImageArray = nullptr; - try { - 
UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); - } catch (ur_result_t Err) { - if (Err == UR_RESULT_ERROR_INVALID_VALUE) { - return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; - } - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost}); try { if (PerformInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - CpyDesc.Depth = pImageDesc->depth; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); + for (const auto &Device : hContext->getDevices()) { + UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device)); } } - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC ImageResDesc; - ImageResDesc.res.array.hArray = ImageArray; - ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; - ImageResDesc.flags = 0; - - CUsurfObject Surface; - UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - - auto MemObj = std::unique_ptr(new ur_mem_handle_t_( - hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); - - if (MemObj == nullptr) { + if (URMemObj == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = MemObj.release(); + *phMem = URMemObj.release(); } catch (ur_result_t Err) { - if (ImageArray) { - cuArrayDestroy(ImageArray); - } + (*phMem)->clear(); return Err; } catch (...) 
{
-    if (ImageArray) {
-      cuArrayDestroy(ImageArray);
-    }
+    (*phMem)->clear();
     return UR_RESULT_ERROR_UNKNOWN;
   }
-
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
@@ -407,14 +260,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
 
   auto Context = hMemory->getContext();
 
-  ScopedContext Active(Context);
+  // Any device will do
+  auto Device = Context->getDevices()[0];
+  ScopedContext Active(Device);
 
   UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
 
   try {
     CUDA_ARRAY3D_DESCRIPTOR ArrayInfo;
     UR_CHECK_ERROR(cuArray3DGetDescriptor(
-        &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray()));
+        &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray(Device)));
 
     const auto cuda2urFormat = [](CUarray_format CUFormat,
                                   ur_image_channel_type_t *ChannelType) {
@@ -544,27 +399,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
   UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
 
   auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
-
-  assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow");
   UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()),
             UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
 
-  // Retained indirectly due to retaining parent buffer below.
-  ur_context_handle_t Context = hBuffer->Context;
-
-  BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic;
-
-  assert(BufferImpl.Ptr != BufferMem::native_type{0});
-  BufferMem::native_type Ptr = BufferImpl.Ptr + pRegion->origin;
-
-  void *HostPtr = nullptr;
-  if (BufferImpl.HostPtr) {
-    HostPtr = static_cast<uint8_t *>(BufferImpl.HostPtr) + pRegion->origin;
-  }
-
-  std::unique_ptr<ur_mem_handle_t_> MemObj{nullptr};
+  std::unique_ptr<ur_mem_handle_t_> RetMemObj{nullptr};
   try {
-    MemObj = std::unique_ptr<ur_mem_handle_t_>{new ur_mem_handle_t_{
-        Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}};
+    for (auto Device : hBuffer->Context->getDevices()) {
+      BufferImpl.getPtr(
+          Device); // This is allocating a dev ptr behind the scenes
+                   // which is necessary before SubBuffer partition
+    }
+    RetMemObj = std::unique_ptr<ur_mem_handle_t_>{
+        new ur_mem_handle_t_{hBuffer, pRegion->origin}};
   } catch (ur_result_t Err) {
     *phMem = nullptr;
     return Err;
@@ -573,6 +419,173 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
     return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
   }
 
-  *phMem = MemObj.release();
+  *phMem = RetMemObj.release();
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
+                                           const ur_device_handle_t hDevice) {
+  ScopedContext Active(hDevice);
+  ur_lock LockGuard(Mem->MemoryAllocationMutex);
+
+  if (Mem->isBuffer()) {
+    auto &Buffer = std::get<BufferMem>(Mem->Mem);
+    auto &DevPtr = Buffer.Ptrs[hDevice->getIndex()];
+
+    // Allocation has already been made
+    if (DevPtr != BufferMem::native_type{0}) {
+      return UR_RESULT_SUCCESS;
+    }
+
+    if (Buffer.MemAllocMode == BufferMem::AllocMode::AllocHostPtr) {
+      // Host allocation has already been made
+      UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+    } else if (Buffer.MemAllocMode == BufferMem::AllocMode::UseHostPtr) {
+      UR_CHECK_ERROR(cuMemHostRegister(Buffer.HostPtr, Buffer.Size,
+                                       CU_MEMHOSTREGISTER_DEVICEMAP));
+      UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+    } else {
+      UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size));
+    }
+  } else {
+    CUarray ImageArray;
+    CUsurfObject Surface;
+    try {
+      auto &Image = std::get<SurfaceMem>(Mem->Mem);
+      // Allocation has already been made
+      if (Image.Arrays[hDevice->getIndex()]) {
+        return UR_RESULT_SUCCESS;
+      }
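+      // Record both the array and the surface at this device's index;
+      // SurfaceMem::getArray/getSurface funnel through this entry point, so
+      // a first touch from any API allocates them under
+      // MemoryAllocationMutex.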
+      UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc));
+      Image.Arrays[hDevice->getIndex()] = ImageArray;
+
+      // CUDA_RESOURCE_DESC is a union of different structs, shown here
+      // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
+      // We need to fill it as described here to use it for a surface or
+      // texture
+      // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
+      // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
+      // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA
+      // array handle.
+      // CUDA_RESOURCE_DESC::flags must be set to zero
+      CUDA_RESOURCE_DESC ImageResDesc;
+      ImageResDesc.res.array.hArray = ImageArray;
+      ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+      ImageResDesc.flags = 0;
+
+      UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
+      Image.SurfObjs[hDevice->getIndex()] = Surface;
+    } catch (ur_result_t Err) {
+      if (ImageArray) {
+        UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+      }
+      return Err;
+    } catch (...) {
+      if (ImageArray) {
+        UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+      }
+      return UR_RESULT_ERROR_UNKNOWN;
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+namespace {
+ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
+                                  ur_device_handle_t hDevice) {
+  auto &Buffer = std::get<BufferMem>(Mem->Mem);
+  if (Mem->LastEventWritingToMemObj == nullptr) {
+    // Device allocation being initialized from host for the first time
+    if (Buffer.HostPtr) {
+      UR_CHECK_ERROR(
+          cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+    }
+  } else if (Mem->LastDeviceWritingToMemObj != hDevice) {
+    UR_CHECK_ERROR(cuMemcpyDtoD(Buffer.getPtr(hDevice),
+                                Buffer.getPtr(Mem->LastDeviceWritingToMemObj),
+                                Buffer.Size));
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
+                                 ur_device_handle_t hDevice) {
+  auto &Image = std::get<SurfaceMem>(Mem->Mem);
+  // When a dimension isn't used image_desc has the size set to 1
+  size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
+                          4; // 4 is the only number of channels we support
+  size_t ImageSizeBytes = PixelSizeBytes * Image.ImageDesc.width *
+                          Image.ImageDesc.height * Image.ImageDesc.depth;
+
+  CUarray ImageArray = Image.getArray(hDevice);
+
+  CUDA_MEMCPY2D CpyDesc2D;
+  CUDA_MEMCPY3D CpyDesc3D;
+  // We have to use a different copy function for each image
+  // dimensionality. Set up the descriptors here; the source is filled in
+  // below once we know whether the data comes from the host or from another
+  // device's allocation.
+  if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+    memset(&CpyDesc2D, 0, sizeof(CpyDesc2D));
+    CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+    CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+    CpyDesc2D.dstArray = ImageArray;
+    CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+    CpyDesc2D.Height = Image.ImageDesc.height;
+  } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+    memset(&CpyDesc3D, 0, sizeof(CpyDesc3D));
+    CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+    CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+    CpyDesc3D.dstArray = ImageArray;
+    CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+    CpyDesc3D.Height = Image.ImageDesc.height;
+    CpyDesc3D.Depth = Image.ImageDesc.depth;
+  }
+
+  if (Mem->LastEventWritingToMemObj == nullptr) {
+    if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      UR_CHECK_ERROR(
+          cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+    } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+      CpyDesc2D.srcHost = Image.HostPtr;
+      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+    } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+      CpyDesc3D.srcHost = Image.HostPtr;
+      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+    }
+  } else if
 (Mem->LastDeviceWritingToMemObj != hDevice) {
+    if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      // FIXME: 1D memcpy from DtoD going through the host.
+      UR_CHECK_ERROR(cuMemcpyAtoH(
+          Image.HostPtr, Image.getArray(Mem->LastDeviceWritingToMemObj),
+          0 /*srcOffset*/, ImageSizeBytes));
+      UR_CHECK_ERROR(
+          cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+    } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+      CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+      CpyDesc2D.srcArray = Image.getArray(Mem->LastDeviceWritingToMemObj);
+      UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+    } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+      CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+      CpyDesc3D.srcArray = Image.getArray(Mem->LastDeviceWritingToMemObj);
+      UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+// If calling this entry point it is necessary to lock the
+// MemoryMigrationMutex beforehand
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
+                                          const ur_device_handle_t hDevice) {
+  UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+  // Device allocation has already been initialized with most up to date
+  // data in buffer
+  if (Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()]) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  ScopedContext Active(hDevice);
+  if (Mem->isBuffer()) {
+    UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+  } else {
+    UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+  }
+
+  Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()] = true;
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp
index 3f09552038..9dcbe23a22 100644
--- a/source/adapters/cuda/memory.hpp
+++ b/source/adapters/cuda/memory.hpp
@@ -11,10 +11,18 @@
 #include <cassert>
 #include <cuda.h>
+#include <memory>
 #include <variant>
 
 #include "common.hpp"
+#include "context.hpp"
+#include "device.hpp"
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                           const ur_device_handle_t);
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+                                          const ur_device_handle_t);
 
 // Handler for plain, pointer-based CUDA allocations
 struct BufferMem {
@@ -27,7 +35,7 @@ struct BufferMem {
     /// Original flags for the mapped region
     ur_map_flags_t MapFlags;
     /// Allocated host memory used exclusively for this map.
-    std::unique_ptr<unsigned char[]> MapMem;
+    std::shared_ptr<unsigned char[]> MapMem;
 
     BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
         : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
@@ -61,11 +69,16 @@ struct BufferMem {
 
   using native_type = CUdeviceptr;
 
+private:
+  /// CUDA handler for the pointer
+  std::vector<native_type> Ptrs;
+
+public:
   /// If this allocation is a sub-buffer (i.e., a view on an existing
   /// allocation), this is the pointer to the parent handler structure
-  ur_mem_handle_t Parent;
-  /// CUDA handler for the pointer
-  native_type Ptr;
+  ur_mem_handle_t Parent = nullptr;
+  /// Outer UR mem holding this BufferMem in variant
+  ur_mem_handle_t OuterMemStruct;
   /// Pointer associated with this device on the host
   void *HostPtr;
   /// Size of the allocation in bytes
@@ -75,12 +88,33 @@ struct BufferMem {
 
   AllocMode MemAllocMode;
 
-  BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
-            void *HostPtr, size_t Size)
-      : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
-        PtrToBufferMap{}, MemAllocMode{Mode} {};
+  BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+            AllocMode Mode, void *HostPtr, size_t Size)
+      : Ptrs(Context->getDevices().size(), native_type{0}),
+        OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size},
+        MemAllocMode{Mode} {};
+
+  BufferMem(const BufferMem &Buffer) = default;
+
+  native_type getPtrWithOffset(const ur_device_handle_t Device,
+                               size_t Offset) {
+    if (ur_result_t Err =
+            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+        Err != UR_RESULT_SUCCESS) {
+      throw Err;
+    }
+    return reinterpret_cast<native_type>(
+        reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex()]) + Offset);
+  }
+
+  native_type getPtr(const ur_device_handle_t Device) {
+    return getPtrWithOffset(Device, 0);
+  }
+
+  void *getVoid(const ur_device_handle_t Device) {
+    return reinterpret_cast<void *>(getPtrWithOffset(Device, 0));
+  }
 
-  native_type get() const noexcept { return Ptr; }
+  bool isSubBuffer() const noexcept { return Parent != nullptr; }
 
   size_t getSize() const noexcept { return Size; }
 
@@ -120,104 +154,319 @@ struct BufferMem {
     assert(MapPtr != nullptr);
     PtrToBufferMap.erase(MapPtr);
   }
-};
 
-// Handler data for surface object (i.e. Images)
-struct SurfaceMem {
-  CUarray Array;
-  CUsurfObject SurfObj;
-  ur_mem_type_t ImageType;
+  ur_result_t clear() {
+    if (Parent != nullptr) {
+      return UR_RESULT_SUCCESS;
+    }
 
-  SurfaceMem(CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType,
-             void *HostPtr)
-      : Array{Array}, SurfObj{Surf}, ImageType{ImageType} {
-    (void)HostPtr;
+    switch (MemAllocMode) {
+    case AllocMode::CopyIn:
+    case AllocMode::Classic:
+      for (auto &DevPtr : Ptrs) {
+        if (DevPtr != native_type{0}) {
+          UR_CHECK_ERROR(cuMemFree(DevPtr));
+        }
+      }
+      break;
+    case AllocMode::UseHostPtr:
+      UR_CHECK_ERROR(cuMemHostUnregister(HostPtr));
+      break;
+    case AllocMode::AllocHostPtr:
+      UR_CHECK_ERROR(cuMemFreeHost(HostPtr));
+    }
+    return UR_RESULT_SUCCESS;
   }
 
-  CUarray getArray() const noexcept { return Array; }
+  friend struct ur_mem_handle_t_;
+  friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+                                                    const ur_device_handle_t);
 
-  CUsurfObject getSurface() const noexcept { return SurfObj; }
+};
 
-  ur_mem_type_t getImageType() const noexcept { return ImageType; }
-};
+// Handler data for surface object (i.e. Images)
+struct SurfaceMem {
+private:
+  std::vector<CUarray> Arrays;
+  std::vector<CUsurfObject> SurfObjs;
 
+public:
+  ur_mem_handle_t OuterMemStruct;
 
-// For sampled/unsampled images
-struct ImageMem {
-  CUarray Array;
-  void *Handle;
-  ur_mem_type_t ImageType;
-  ur_sampler_handle_t Sampler;
+  ur_image_format_t ImageFormat;
+  ur_image_desc_t ImageDesc;
+  CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
+  size_t PixelTypeSizeBytes;
+  void *HostPtr;
 
-  ImageMem(CUarray Array, void *Handle, ur_mem_type_t ImageType,
-           ur_sampler_handle_t Sampler)
-      : Array{Array}, Handle{Handle}, ImageType{ImageType}, Sampler{Sampler} {};
+  SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+             ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+             void *HostPtr)
+      : Arrays(Context->Devices.size(), CUarray{0}),
+        SurfObjs(Context->Devices.size(), CUsurfObject{0}),
+        OuterMemStruct{OuterMemStruct},
+        ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} {
+    // We have to use cuArray3DCreate, which has some caveats. The height and
+    // depth parameters must be set to 0 to produce 1D or 2D arrays.
+    // image_desc gives a minimum value of 1, so we need to convert the
+    // answer.
+    ArrayDesc.NumChannels = 4; // Only support 4 channel image
+    ArrayDesc.Flags = 0;       // No flags required
+    ArrayDesc.Width = ImageDesc.width;
+    if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+      ArrayDesc.Height = 0;
+      ArrayDesc.Depth = 0;
+    } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+      ArrayDesc.Height = ImageDesc.height;
+      ArrayDesc.Depth = 0;
+    } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+      ArrayDesc.Height = ImageDesc.height;
+      ArrayDesc.Depth = ImageDesc.depth;
+    }
 
-  CUarray get_array() const noexcept { return Array; }
+    // We need to get PixelTypeSizeBytes for calculating the total image size
+    // later
+    switch (ImageFormat.channelType) {
+
+    case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
+    case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
+      ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+      PixelTypeSizeBytes = 1;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
+      ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
+      PixelTypeSizeBytes = 1;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
+    case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
+      ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
+      PixelTypeSizeBytes = 2;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
+      ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
+      PixelTypeSizeBytes = 2;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
+      ArrayDesc.Format = CU_AD_FORMAT_HALF;
+      PixelTypeSizeBytes = 2;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
+      ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
+      PixelTypeSizeBytes = 4;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
+      ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32;
+      PixelTypeSizeBytes = 4;
+      break;
+    case UR_IMAGE_CHANNEL_TYPE_FLOAT:
+      ArrayDesc.Format = CU_AD_FORMAT_FLOAT;
+      PixelTypeSizeBytes = 4;
+      break;
+    default:
+      detail::ur::die(
+          "urMemImageCreate given unsupported image_channel_data_type");
+    }
+  }
 
-  void *get_handle() const noexcept { return Handle; }
+  // Will allocate a new array on device if not already allocated
+  CUarray getArray(const ur_device_handle_t Device) {
+    if (ur_result_t Err =
+            allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+        Err != UR_RESULT_SUCCESS) {
+      throw Err;
+    }
+    return Arrays[Device->getIndex()];
+  }
+  // Will allocate a new surface on device if not already allocated
+  CUsurfObject getSurface(const ur_device_handle_t Device)
{ + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return SurfObjs[Device->getIndex()]; + } - ur_mem_type_t get_image_type() const noexcept { return ImageType; } + ur_mem_type_t getType() { return ImageDesc.type; } - ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } + ur_result_t clear() { + for (auto Array : Arrays) { + if (Array) { + UR_CHECK_ERROR(cuArrayDestroy(Array)); + } + } + for (auto Surf : SurfObjs) { + if (Surf != CUsurfObject{0}) { + UR_CHECK_ERROR(cuSurfObjectDestroy(Surf)); + } + } + return UR_RESULT_SUCCESS; + } + friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); }; /// UR Mem mapping to CUDA memory allocations, both data and texture/surface. /// \brief Represents non-SVM allocations on the CUDA backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. +/// +/// The ur_mem_handle_t is responsible for memory allocation and migration +/// across devices in the same ur_context_handle_t. If a kernel writes to a +/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// subsequent operations that want to read from the ur_mem_handle_t must wait +/// on the event referring to the last write. +/// +/// Since urMemBufferCreate/urMemImageCreate do not take a queue or device +/// object, only a ur_context_handle_t, at mem obj creation we don't know which +/// device we must make a native image/allocation on. Therefore no allocations +/// are made at urMemBufferCreate/urMemImageCreate. Instead device +/// images/allocations are made lazily. These allocations are made implicitly +/// with a call to getPtr/getArray which will allocate a new allocation/image on +/// device if need be. +/// +/// Memory migration between native allocations for devices in the same +/// ur_context_handle_t will occur at: +/// +/// 1. urEnqueueKernelLaunch +/// 2. urEnqueueMem(Buffer|Image)Read(Rect) +/// +/// Migrations will occur in both cases if the most recent version of data +/// is on a different device, marked by LastDeviceWritingToMemObj +/// +/// Example trace: +/// ~~~~~~~~~~~~~~ +/// +/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] +/// -> OUT: hContext +/// +/// =====> urMemBufferCreate(hContext,...); +/// -> No native allocations made +/// -> OUT: hBuffer +/// +/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); +/// -> Allocation made on q0 ie device0 +/// -> New allocation initialized with host data. +/// +/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); +/// -> ur_kernel_handle_t associated with a ur_program_handle_t, +/// which is in turn unique to a device. So we can set the kernel +/// arg with the ptr of the device specific allocation. +/// -> hKernel0->getProgram()->getDevice() == device0 +/// -> allocateMemObjOnDeviceIfNeeded(device0); +/// -> Native allocation already made on device0, continue. +/// +/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); +/// -> Suppose that hKernel0 writes to hBuffer. 
+/// -> Call hBuffer->setLastEventWritingToMemObj with return event +/// from this operation +/// -> Enqueue native kernel launch +/// +/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); +/// -> hKernel1->getProgram()->getDevice() == device1 +/// -> New allocation will be made on device1 when calling +/// getPtr(device1) +/// -> No native allocation on device1 +/// -> Make native allocation on device1 +/// +/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); +/// -> Suppose hKernel1 wants to read from hBuffer and not write. +/// -> migrateMemoryToDeviceIfNeeded(device1); +/// -> hBuffer->LastEventWritingToMemObj is not nullptr +/// -> Check if memory has been migrated to device1 since the +/// last write +/// -> Hasn't been migrated +/// -> Wait on LastEventWritingToMemObj. +/// -> Migrate memory from device0's native allocation to +/// device1's native allocation. +/// -> Enqueue native kernel launch +/// +/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); +/// -> migrateMemoryToDeviceIfNeeded(device0); +/// -> hBuffer->LastEventWritingToMemObj refers to an event +/// from q0 +/// -> Migration not necessary +/// -> Enqueue native kernel launch +/// struct ur_mem_handle_t_ { // Context where the memory object is accessible ur_context_handle_t Context; /// Reference counting of the handler std::atomic_uint32_t RefCount; - enum class Type { Buffer, Surface, Texture } MemType; // Original mem flags passed ur_mem_flags_t MemFlags; + // If we make a ur_mem_handle_t_ from a native allocation, it can be useful to + // associate it with the device that holds the native allocation. + ur_device_handle_t DeviceWithNativeAllocation{nullptr}; + + // Has the memory been migrated to a device since the last write? + std::vector HaveMigratedToDeviceSinceLastWrite; + + // We should wait on this event prior to migrating memory across allocations + // in this ur_mem_handle_t_ + ur_event_handle_t LastEventWritingToMemObj{nullptr}; + + // Since the event may not contain device info (if using interop, which + // doesn't take a queue) we should use this member var to keep track of which + // device has most recent view of data + ur_device_handle_t LastDeviceWritingToMemObj{nullptr}; + + // Enumerates all possible types of accesses. + enum access_mode_t { unknown, read_write, read_only, write_only }; + + ur_mutex MemoryAllocationMutex; // A mutex for allocations + ur_mutex MemoryMigrationMutex; // A mutex for memory transfers + /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces. - /// This union allows implementation to use either from the same handler. - std::variant Mem; + /// This variant allows implementation to use either from the same handler. 
+  std::variant<BufferMem, SurfaceMem> Mem;
 
   /// Constructs the UR mem handler for a non-typed allocation ("buffer")
-  ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent,
-                   ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
-                   CUdeviceptr Ptr, void *HostPtr, size_t Size)
-      : Context{Context}, RefCount{1}, MemType{Type::Buffer},
-        MemFlags{MemFlags}, Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} {
-    if (isSubBuffer()) {
-      urMemRetain(std::get<BufferMem>(Mem).Parent);
-    } else {
-      urContextRetain(Context);
-    }
-  };
-
-  /// Constructs the UR allocation for an Image object (surface in CUDA)
-  ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
-                   CUsurfObject Surf, ur_mem_flags_t MemFlags,
-                   ur_mem_type_t ImageType, void *HostPtr)
-      : Context{Context}, RefCount{1}, MemType{Type::Surface},
-        MemFlags{MemFlags}, Mem{SurfaceMem{Array, Surf, ImageType, HostPtr}} {
+  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+                   BufferMem::AllocMode Mode, void *HostPtr, size_t Size)
+      : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+        Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
     urContextRetain(Context);
-  }
+  };
 
-  /// Constructs the UR allocation for an unsampled image object
-  ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
-                   CUsurfObject Surf, ur_mem_type_t ImageType)
-      : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{0},
-        Mem{ImageMem{Array, (void *)Surf, ImageType, nullptr}} {
-    urContextRetain(Context);
-  }
+  // Subbuffer constructor
+  ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
+      : Context{Parent->Context}, RefCount{1}, MemFlags{Parent->MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Parent->Context->Devices.size(),
+                                           false),
+        Mem{BufferMem{std::get<BufferMem>(Parent->Mem)}} {
+    auto &SubBuffer = std::get<BufferMem>(Mem);
+    SubBuffer.Parent = Parent;
+    SubBuffer.OuterMemStruct = this;
+    if (SubBuffer.HostPtr) {
+      SubBuffer.HostPtr =
+          static_cast<uint8_t *>(SubBuffer.HostPtr) + SubBufferOffset;
+    }
+    for (auto &DevPtr : SubBuffer.Ptrs) {
+      if (DevPtr) {
+        DevPtr += SubBufferOffset;
+      }
+    }
+    urMemRetain(Parent);
+  };
 
-  /// Constructs the UR allocation for a sampled image object
-  ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex,
-                   ur_sampler_handle_t Sampler, ur_mem_type_t ImageType)
-      : Context{Context}, RefCount{1}, MemType{Type::Texture}, MemFlags{0},
-        Mem{ImageMem{Array, (void *)Tex, ImageType, Sampler}} {
+  /// Constructs the UR mem handler for an Image object
+  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+                   ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+                   void *HostPtr)
+      : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+        Mem{std::in_place_type<SurfaceMem>,
+            Ctxt,
+            this,
+            ImageFormat,
+            ImageDesc,
+            HostPtr} {
     urContextRetain(Context);
   }
 
@@ -229,13 +478,24 @@ struct ur_mem_handle_t_ {
     urContextRelease(Context);
   }
 
-  bool isBuffer() const noexcept { return MemType == Type::Buffer; }
+  bool isBuffer() const noexcept {
+    return std::holds_alternative<BufferMem>(Mem);
+  }
 
   bool isSubBuffer() const noexcept {
     return (isBuffer() && (std::get<BufferMem>(Mem).Parent != nullptr));
   }
 
-  bool isImage() const noexcept { return MemType == Type::Surface; }
+  bool isImage() const noexcept {
+    return std::holds_alternative<SurfaceMem>(Mem);
+  }
+
+  ur_result_t clear() {
+    if (isBuffer()) {
+      return std::get<BufferMem>(Mem).clear();
+    }
+    return std::get<SurfaceMem>(Mem).clear();
+  }
 
   ur_context_handle_t getContext() const noexcept {
return Context; } @@ -244,4 +504,24 @@ struct ur_mem_handle_t_ { uint32_t decrementReferenceCount() noexcept { return --RefCount; } uint32_t getReferenceCount() const noexcept { return RefCount; } + + void setLastEventWritingToMemObj(ur_event_handle_t NewEvent, + ur_device_handle_t RecentDevice) { + assert(NewEvent && "Invalid event!"); + // This entry point should only ever be called when using multi device ctx + assert(Context->Devices.size() > 1); + urEventRetain(NewEvent); + urDeviceRetain(RecentDevice); + if (LastEventWritingToMemObj != nullptr) { + urEventRelease(LastEventWritingToMemObj); + } + if (LastDeviceWritingToMemObj != nullptr) { + urDeviceRelease(LastDeviceWritingToMemObj); + } + LastEventWritingToMemObj = NewEvent; + for (const auto &Device : Context->getDevices()) { + HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] = + Device == RecentDevice; + } + } }; diff --git a/source/adapters/cuda/physical_mem.cpp b/source/adapters/cuda/physical_mem.cpp index e2c46696a0..c8d4adf0ef 100644 --- a/source/adapters/cuda/physical_mem.cpp +++ b/source/adapters/cuda/physical_mem.cpp @@ -23,7 +23,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( CUmemAllocationProp AllocProps = {}; AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; - UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id)); + AllocProps.location.id = hDevice->getIndex(); CUmemGenericAllocationHandle ResHandle; switch (auto Result = cuMemCreate(&ResHandle, size, &AllocProps, 0)) { @@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( default: UR_CHECK_ERROR(Result); } - *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext); + *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext, hDevice); return UR_RESULT_SUCCESS; } @@ -51,7 +51,7 @@ urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { try { std::unique_ptr PhysicalMemGuard(hPhysicalMem); - ScopedContext Active(hPhysicalMem->getContext()); + ScopedContext Active(hPhysicalMem->getDevice()); UR_CHECK_ERROR(cuMemRelease(hPhysicalMem->get())); return UR_RESULT_SUCCESS; } catch (ur_result_t err) { diff --git a/source/adapters/cuda/physical_mem.hpp b/source/adapters/cuda/physical_mem.hpp index 0ce332e112..c77ad0d547 100644 --- a/source/adapters/cuda/physical_mem.hpp +++ b/source/adapters/cuda/physical_mem.hpp @@ -26,41 +26,29 @@ struct ur_physical_mem_handle_t_ { std::atomic_uint32_t RefCount; native_type PhysicalMem; ur_context_handle_t_ *Context; + ur_device_handle_t Device; - ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx) - : RefCount(1), PhysicalMem(PhysMem), Context(Ctx) { + ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx, + ur_device_handle_t Device) + : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device) { urContextRetain(Context); + urDeviceRetain(Device); } - ~ur_physical_mem_handle_t_() { urContextRelease(Context); } + ~ur_physical_mem_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } native_type get() const noexcept { return PhysicalMem; } ur_context_handle_t_ *getContext() const noexcept { return Context; } + ur_device_handle_t_ *getDevice() const noexcept { return Device; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } uint32_t decrementReferenceCount() noexcept { return --RefCount; } uint32_t getReferenceCount() const noexcept { return RefCount; } }; - -// Find a device ordinal of a device. 
-inline ur_result_t GetDeviceOrdinal(ur_device_handle_t Device, int &Ordinal) { - ur_adapter_handle_t AdapterHandle = &adapter; - // Get list of platforms - uint32_t NumPlatforms; - UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms)); - UR_ASSERT(NumPlatforms, UR_RESULT_ERROR_UNKNOWN); - - std::vector Platforms{NumPlatforms}; - UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, NumPlatforms, - Platforms.data(), nullptr)); - - // Ordinal corresponds to the platform ID as each device has its own platform. - CUdevice NativeDevice = Device->get(); - for (Ordinal = 0; size_t(Ordinal) < Platforms.size(); ++Ordinal) - if (Platforms[Ordinal]->Devices[0]->get() == NativeDevice) - return UR_RESULT_SUCCESS; - return UR_RESULT_ERROR_INVALID_DEVICE; -} diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index 4c730f997a..27b94f756f 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -52,9 +52,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// There is only one CUDA platform, and contains all devices on the system. /// Triggers the CUDA Driver initialization (cuInit) the first time, so this /// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -62,7 +59,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, try { static std::once_flag InitFlag; static uint32_t NumPlatforms = 1; - static std::vector Platforms; + static ur_platform_handle_t_ Platform; UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); @@ -76,39 +73,34 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, int NumDevices = 0; UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); try { - // make one platform per device - NumPlatforms = NumDevices; - Platforms.resize(NumDevices); - for (int i = 0; i < NumDevices; ++i) { CUdevice Device; UR_CHECK_ERROR(cuDeviceGet(&Device, i)); CUcontext Context; UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); - ScopedContext active(Context); + ScopedContext Active(Context); // Set native ctx as active CUevent EvBase; UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); // Use default stream to record base event counter UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); - Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ - Device, Context, EvBase, &Platforms[i]}); + Platform.Devices.emplace_back( + new ur_device_handle_t_{Device, Context, EvBase, &Platform, + static_cast(i)}); } } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); + Platform.Devices.clear(); } - Platforms.clear(); Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (ur_result_t Err) { // Clear and rethrow to allow retry for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); + Platform.Devices.clear(); } - Platforms.clear(); Result = Err; throw Err; } catch (...) 
{ @@ -123,9 +115,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, } if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { - phPlatforms[i] = &Platforms[i]; - } + *phPlatforms = &Platform; } return Result; diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 3f044c272f..eed17b9116 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -45,14 +45,6 @@ bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, return true; } -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) - : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, - Context{Context}, KernelReqdWorkGroupSizeMD{} { - urContextRetain(Context); -} - -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } - ur_result_t ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, size_t Length) { @@ -189,12 +181,14 @@ ur_result_t createProgram(ur_context_handle_t hContext, const uint8_t *pBinary, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { - UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetProgram{ - new ur_program_handle_t_{hContext}}; + new ur_program_handle_t_{hContext, hDevice}}; if (pProperties) { if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { @@ -214,18 +208,12 @@ ur_result_t createProgram(ur_context_handle_t hContext, return UR_RESULT_SUCCESS; } -/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. -/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in -/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. 
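+// With PTX/CUBIN input a module must be built against one specific device,
+// so callers targeting a multi-device context are expected to create one
+// program per device with urProgramCreateWithBinary instead. A minimal
+// sketch (error handling omitted, names assumed):
+//
+//   for (ur_device_handle_t Dev : Devices) {
+//     ur_program_handle_t Prog;
+//     urProgramCreateWithBinary(hContext, Dev, length, pBinary, nullptr,
+//                               &Prog);
+//   }
+//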
+// A program is unique to a device so this entry point cannot be supported with +// a multi device context UR_APIEXPORT ur_result_t UR_APICALL -urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, - size_t length, const ur_program_properties_t *pProperties, - ur_program_handle_t *phProgram) { - ur_device_handle_t hDevice = hContext->getDevice(); - auto pBinary = reinterpret_cast(pIL); - - return createProgram(hContext, hDevice, length, pBinary, pProperties, - phProgram); +urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, + const ur_program_properties_t *, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } /// CUDA will handle the PTX/CUBIN binaries internally through a call to @@ -264,7 +252,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); hProgram->buildProgram(pOptions); hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; @@ -289,13 +277,17 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, const ur_program_handle_t *phPrograms, const char *pOptions, ur_program_handle_t *phProgram) { ur_result_t Result = UR_RESULT_SUCCESS; + // All programs must be associated with the same device + for (auto i = 1u; i < count; ++i) + UR_ASSERT(phPrograms[i]->getDevice() == phPrograms[0]->getDevice(), + UR_RESULT_ERROR_INVALID_DEVICE); try { - ScopedContext Active(hContext); + ScopedContext Active(phPrograms[0]->getDevice()); CUlinkState State; std::unique_ptr RetProgram{ - new ur_program_handle_t_{hContext}}; + new ur_program_handle_t_{hContext, phPrograms[0]->getDevice()}}; UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); try { @@ -382,7 +374,8 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->Context->DeviceID, 1); + return ReturnValue( + static_cast(hProgram->getDevice()->getIndex())); case UR_PROGRAM_INFO_SOURCE: return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: @@ -426,7 +419,7 @@ urProgramRelease(ur_program_handle_t hProgram) { ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); auto cuModule = hProgram->get(); // "0" is a valid handle for a cuModule, so the best way to check if we // actually loaded a module and need to unload it is to look at the build @@ -483,8 +476,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( ur_device_handle_t hDevice, ur_program_handle_t hProgram, const char *pFunctionName, void **ppFunctionPointer) { // Check if device passed is the same the device bound to the context - UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), - UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(hDevice == hProgram->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); CUfunction Func; CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp index feb3a09f31..5d41374d34 100644 --- a/source/adapters/cuda/program.hpp +++ b/source/adapters/cuda/program.hpp @@ -24,6 +24,7 @@ struct ur_program_handle_t_ { size_t BinarySizeInBytes; std::atomic_uint32_t RefCount; ur_context_handle_t Context; + ur_device_handle_t Device; /* The ur_program_binary_type_t property is 
defined individually for every * device in a program. However, since the CUDA adapter only has 1 device per @@ -42,8 +43,17 @@ struct ur_program_handle_t_ { std::string BuildOptions; ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - ur_program_handle_t_(ur_context_handle_t Context); - ~ur_program_handle_t_(); + ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); + urDeviceRetain(Device); + } + + ~ur_program_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); @@ -51,6 +61,7 @@ struct ur_program_handle_t_ { ur_result_t buildProgram(const char *BuildOptions); ur_context_handle_t getContext() const { return Context; }; + ur_device_handle_t getDevice() const noexcept { return Device; }; native_type get() const noexcept { return Module; }; diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index 120d665524..773126f4d5 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -123,7 +123,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, try { std::unique_ptr Queue{nullptr}; - if (hContext->getDevice() != hDevice) { + if (std::find(hContext->getDevices().begin(), hContext->getDevices().end(), + hDevice) == hContext->getDevices().end()) { *phQueue = nullptr; return UR_RESULT_ERROR_INVALID_DEVICE; } @@ -145,10 +146,10 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, IsOutOfOrder = true; } if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { - ScopedContext Active(hContext); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority)); } else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) { - ScopedContext Active(hContext); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr)); } } @@ -193,7 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { if (!hQueue->backendHasOwnership()) return UR_RESULT_SUCCESS; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); hQueue->forEachStream([](CUstream S) { UR_CHECK_ERROR(cuStreamSynchronize(S)); @@ -212,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->getContext()); + ScopedContext active(hQueue->getDevice()); hQueue->syncStreams( [](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); }); @@ -242,7 +243,7 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); *phNativeQueue = reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; @@ -276,7 +277,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( new ur_queue_handle_t_{std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, - hContext->getDevice(), + hDevice, CuFlags, Flags, /*priority*/ 0, diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index c79ca18a9b..46e9968fa9 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -97,6 +97,7 @@ struct ur_queue_handle_t_ { 
uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; + ur_device_handle_t getDevice() const noexcept { return Device; }; bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index a908f39654..4e6c6898d5 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -95,10 +95,9 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) { +ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) { ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(Context); unsigned int IsManaged; unsigned int Type; void *AttributeValues[2] = {&IsManaged, &Type}; @@ -131,11 +130,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, return USMFreeImpl(hContext, pMem); } -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_device_mem_flags_t, - size_t Size, uint32_t Alignment) { +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t, size_t Size, + uint32_t Alignment) { try { - ScopedContext Active(Context); + ScopedContext Active(Device); UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size)); } catch (ur_result_t Err) { return Err; @@ -150,12 +150,13 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_host_mem_flags_t, +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t, ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { - ScopedContext Active(Context); + ScopedContext Active(Device); UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size, CU_MEM_ATTACH_GLOBAL)); } catch (ur_result_t Err) { @@ -171,11 +172,10 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t, ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - ScopedContext Active(Context); UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size)); } catch (ur_result_t Err) { return Err; @@ -199,7 +199,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { - ScopedContext Active(hContext); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { unsigned int Value; @@ -375,7 +374,7 @@ ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) - : Context(Context) { + : Context{Context} { const void *pNext = PoolDesc->pNext; while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = static_cast(pNext); @@ -406,25 +405,25 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) .second; - auto Device = Context->DeviceID; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) 
- .second; - DeviceMemPool = - umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]) - .second; - - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; - SharedMemPool = - umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]) - .second; - Context->addPool(this); + for (const auto &Device : Context->getDevices()) { + MemProvider = + umf::memoryProviderMakeUnique(Context, Device) + .second; + DeviceMemPool = umf::poolMakeUniqueFromOps( + &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + &this->DisjointPoolConfigs + .Configs[usm::DisjointPoolMemType::Device]) + .second; + MemProvider = + umf::memoryProviderMakeUnique(Context, Device) + .second; + SharedMemPool = umf::poolMakeUniqueFromOps( + &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + &this->DisjointPoolConfigs + .Configs[usm::DisjointPoolMemType::Shared]) + .second; + Context->addPool(this); + } } bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) { diff --git a/source/adapters/cuda/usm_p2p.cpp b/source/adapters/cuda/usm_p2p.cpp index 810a11ef84..e09220d730 100644 --- a/source/adapters/cuda/usm_p2p.cpp +++ b/source/adapters/cuda/usm_p2p.cpp @@ -14,8 +14,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0)); + ScopedContext active(commandDevice); + UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getNativeContext(), 0)); } catch (ur_result_t err) { return err; } @@ -25,8 +25,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext())); + ScopedContext active(commandDevice); + UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getNativeContext())); } catch (ur_result_t err) { return err; } @@ -43,7 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( int value; CUdevice_P2PAttribute cuAttr; try { - ScopedContext active(commandDevice->getContext()); + ScopedContext active(commandDevice); switch (propName) { case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { cuAttr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp index 9c37dda4fb..29908ad1d4 100644 --- a/source/adapters/cuda/virtual_mem.cpp +++ b/source/adapters/cuda/virtual_mem.cpp @@ -17,12 +17,12 @@ #include UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_context_handle_t, ur_device_handle_t hDevice, ur_virtual_mem_granularity_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - ScopedContext Active(hContext); + ScopedContext Active(hDevice); switch (propName) { case UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM: case UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED: { @@ -33,7 +33,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( CUmemAllocationProp AllocProps = {}; AllocProps.location.type = 
CU_MEM_LOCATION_TYPE_DEVICE; AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; - UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id)); + AllocProps.location.id = hDevice->getIndex(); size_t Granularity; UR_CHECK_ERROR( @@ -50,15 +50,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, size_t size, void **ppStart) { - ScopedContext Active(hContext); + // Reserve the virtual mem. Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR(cuMemAddressReserve((CUdeviceptr *)ppStart, size, 0, (CUdeviceptr)pStart, 0)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree( - ur_context_handle_t hContext, const void *pStart, size_t size) { - ScopedContext Active(hContext); +UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(ur_context_handle_t, + const void *pStart, + size_t size) { UR_CHECK_ERROR(cuMemAddressFree((CUdeviceptr)pStart, size)); return UR_RESULT_SUCCESS; } @@ -66,22 +67,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree( UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart, size_t size, ur_virtual_mem_access_flags_t flags) { - CUmemAccessDesc AccessDesc = {}; - if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY) - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ; - else - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; - AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // TODO: When contexts support multiple devices, we should create a descriptor - // for each. We may also introduce a variant of this function with a - // specific device. - UR_CHECK_ERROR( - GetDeviceOrdinal(hContext->getDevice(), AccessDesc.location.id)); - - ScopedContext Active(hContext); - UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1)); + // Set access for every device in the context + for (auto &Device : hContext->getDevices()) { + CUmemAccessDesc AccessDesc = {}; + if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY) + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ; + else + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + AccessDesc.location.id = Device->getIndex(); + ScopedContext Active(Device); + UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1)); + } return UR_RESULT_SUCCESS; } @@ -89,7 +88,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, ur_physical_mem_handle_t hPhysicalMem, size_t offset, ur_virtual_mem_access_flags_t flags) { - ScopedContext Active(hContext); + // Map the virtual mem. Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR( cuMemMap((CUdeviceptr)pStart, size, offset, hPhysicalMem->get(), 0)); if (flags) @@ -99,7 +99,8 @@ urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemUnmap( ur_context_handle_t hContext, const void *pStart, size_t size) { - ScopedContext Active(hContext); + // Unmap the virtual mem. 
Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR(cuMemUnmap((CUdeviceptr)pStart, size)); return UR_RESULT_SUCCESS; } @@ -110,12 +111,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo( size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - ScopedContext Active(hContext); + // Set arbitrary context + ScopedContext Active(hContext->getDevices()[0]); switch (propName) { case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: { CUmemLocation MemLocation = {}; MemLocation.type = CU_MEM_LOCATION_TYPE_DEVICE; - UR_CHECK_ERROR(GetDeviceOrdinal(hContext->getDevice(), MemLocation.id)); + MemLocation.id = hContext->getDevices()[0]->getIndex(); unsigned long long CuAccessFlags; UR_CHECK_ERROR( diff --git a/test/adapters/cuda/context_tests.cpp b/test/adapters/cuda/context_tests.cpp index 37742a002c..971601eb64 100644 --- a/test/adapters/cuda/context_tests.cpp +++ b/test/adapters/cuda/context_tests.cpp @@ -21,7 +21,9 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) { // Retrieve the CUDA context to check information is correct auto checkValue = [=] { - CUcontext cudaContext = context.handle->get(); + // Just testing the first device in context + CUcontext cudaContext = + context.handle->getDevices()[0]->getNativeContext(); unsigned int version = 0; EXPECT_SUCCESS_CUDA(cuCtxGetApiVersion(cudaContext, &version)); EXPECT_EQ(version, known_cuda_api_version); @@ -102,7 +104,8 @@ TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) { // check that context is now the active cuda context ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); - ASSERT_EQ(current, context->get()); + // Just testing the first device in context + ASSERT_EQ(current, context->getDevices()[0]->getNativeContext()); } TEST_P(cudaUrContextCreateTest, ThreadedContext) { @@ -173,7 +176,8 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) { // check that the 2nd context is now tha active cuda context ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); - ASSERT_EQ(current, context2->get()); + // Just checking for first device in context + ASSERT_EQ(current, context2->getDevices()[0]->getNativeContext()); } }); From 57c53efe34278d5a4951bba022c02646eae1e295 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 1 May 2024 16:31:48 +0100 Subject: [PATCH 02/21] Add missing flags and pass ptr ptr - Flags was not set in creating surface. 
- Pass address of CUdeviceptr, not reinterpret cast --- source/adapters/cuda/enqueue.cpp | 4 ++-- source/adapters/cuda/memory.cpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 55d8cfd3cc..33907c49fc 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -651,9 +651,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( UR_CHECK_ERROR(RetImplEvent->start()); } - void *DevPtr = std::get(hBuffer->Mem).getVoid(Device); + auto DevPtr = std::get(hBuffer->Mem).getPtr(Device); UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( - Stream, region, DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + Stream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, hostSlicePitch)); diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 38f0ccf5d3..89abd287ed 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -458,15 +458,18 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc)); Image.Arrays[hDevice->getIndex()] = ImageArray; - // HIP_RESOURCE_DESC is a union of different structs, shown here + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html // We need to fill it as described here to use it for a surface or texture - // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and - // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array - // handle. - // HIP_RESOURCE_DESC::flags must be set to zero + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA + // array handle. 
CUDA_RESOURCE_DESC::flags must be set to zero CUDA_RESOURCE_DESC ImageResDesc; ImageResDesc.res.array.hArray = ImageArray; ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); Image.SurfObjs[hDevice->getIndex()] = Surface; From 63428c76a1a605b5ac634ea642f75e5d87bef673 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 1 May 2024 17:21:22 +0100 Subject: [PATCH 03/21] Update the return val for context get info Need to provide a size for context get info --- source/adapters/cuda/context.cpp | 3 ++- test/adapters/cuda/context_tests.cpp | 10 ---------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp index 7499534585..0d08dab76c 100644 --- a/source/adapters/cuda/context.cpp +++ b/source/adapters/cuda/context.cpp @@ -70,7 +70,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(static_cast(hContext->getDevices().size())); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->getDevices()); + return ReturnValue(hContext->getDevices().data(), + hContext->getDevices().size()); case UR_CONTEXT_INFO_REFERENCE_COUNT: return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { diff --git a/test/adapters/cuda/context_tests.cpp b/test/adapters/cuda/context_tests.cpp index 971601eb64..2405cb34aa 100644 --- a/test/adapters/cuda/context_tests.cpp +++ b/test/adapters/cuda/context_tests.cpp @@ -96,16 +96,6 @@ TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) { // ensure the queue has the correct context ASSERT_EQ(context, queue->getContext()); - - // create a buffer in the context to set the context as active - uur::raii::Mem buffer; - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024, - nullptr, buffer.ptr())); - - // check that context is now the active cuda context - ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); - // Just testing the first device in context - ASSERT_EQ(current, context->getDevices()[0]->getNativeContext()); } TEST_P(cudaUrContextCreateTest, ThreadedContext) { From 578d3981047ae09964a590f03375072bfe9a0c4a Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Wed, 1 May 2024 17:44:52 +0100 Subject: [PATCH 04/21] Update tests It doesn't make sense to check that a context has been set in the background when a buffer is created. Also, since images and buffers are now allocated lazily, we can't know if the mem/image creation will fail at `urMemImageCreate`. Maybe this is a problem and we should allocate eagerly for single device contexts. --- test/adapters/cuda/context_tests.cpp | 11 ----------- test/conformance/memory/memory_adapter_cuda.match | 1 + 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/test/adapters/cuda/context_tests.cpp b/test/adapters/cuda/context_tests.cpp index 2405cb34aa..7d6a329b9a 100644 --- a/test/adapters/cuda/context_tests.cpp +++ b/test/adapters/cuda/context_tests.cpp @@ -120,7 +120,6 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) { // the first context, and then create and release another queue with // the second context. 
auto test_thread = std::thread([&] {
-        CUcontext current = nullptr;
         {
             // create a queue with the first context
@@ -158,16 +157,6 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) {
             // ensure queue has correct context
             ASSERT_EQ(context2, queue->getContext());
-
-            // create a buffer to set the active context
-            uur::raii::Mem buffer = nullptr;
-            ASSERT_SUCCESS(urMemBufferCreate(context2, UR_MEM_FLAG_READ_WRITE,
-                                             1024, nullptr, buffer.ptr()));
-
-            // check that the 2nd context is now tha active cuda context
-            ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(&current));
-            // Just checking for first device in context
-            ASSERT_EQ(current, context2->getDevices()[0]->getNativeContext());
         }
     });
diff --git a/test/conformance/memory/memory_adapter_cuda.match b/test/conformance/memory/memory_adapter_cuda.match
index cd65787020..4b5c751d67 100644
--- a/test/conformance/memory/memory_adapter_cuda.match
+++ b/test/conformance/memory/memory_adapter_cuda.match
@@ -1 +1,2 @@
urMemBufferCreateWithNativeHandleTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_
+urMemImageCreateTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_

From 4c0476cc57123972d86eec261e0df2efae8526cf Mon Sep 17 00:00:00 2001
From: Hugh Delaney
Date: Thu, 2 May 2024 10:12:32 +0100
Subject: [PATCH 05/21] Update program tests

Programs were being created with IL (urProgramCreateWithIL), which cannot be
supported for a multi-device context since we want programs to be unique to a
device.

---
 .../program/program_adapter_cuda.match | 39 ++++++-------------
 test/conformance/source/environment.cpp | 7 ++--
 2 files changed, 15 insertions(+), 31 deletions(-)

diff --git a/test/conformance/program/program_adapter_cuda.match b/test/conformance/program/program_adapter_cuda.match
index 8b917219ec..8a4835f942 100644
--- a/test/conformance/program/program_adapter_cuda.match
+++ b/test/conformance/program/program_adapter_cuda.match
@@ -1,29 +1,12 @@
urProgramBuildTest.BuildFailure/NVIDIA_CUDA_BACKEND___{{.*}}_
-{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/NVIDIA_CUDA_BACKEND___{{.*}}_
-{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/NVIDIA_CUDA_BACKEND___{{.*}}_
-{{OPT}}urProgramGetBuildInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE
-{{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/NVIDIA_CUDA_BACKEND___{{.*}}_
-{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS
-{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_REFERENCE_COUNT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_CONTEXT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_SOURCE -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARY_SIZES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARIES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES -{{OPT}}urProgramLinkTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/NVIDIA_CUDA_BACKEND___{{.*}}_ -urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}}_ -urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}}_ +{{OPT}}urProgramCreateWithILTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramCreateWithILTest.SuccessWithProperties/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramCreateWithILTest.BuildInvalidProgram/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoSingleTest.NumDevicesMatchesDeviceArray/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/NVIDIA_CUDA_BACKEND___{{.*}} +urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}} +urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}} diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index a5f83c0d80..209bc8f9a2 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -465,9 +465,10 @@ ur_result_t KernelsEnvironment::CreateProgram( nullptr)) { return error; } - if (backend == UR_PLATFORM_BACKEND_HIP) { - // The HIP adapter does not support urProgramCreateWithIL so we need to - // use urProgramCreateWithBinary instead. + if (backend == UR_PLATFORM_BACKEND_HIP || + backend == UR_PLATFORM_BACKEND_CUDA) { + // The CUDA and HIP adapters do not support urProgramCreateWithIL so we + // need to use urProgramCreateWithBinary instead. if (auto error = urProgramCreateWithBinary( hContext, hDevice, binary.size(), reinterpret_cast(binary.data()), properties, From 1c3dcb761c621afd00a840c60c024785ee545188 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 2 May 2024 10:17:43 +0100 Subject: [PATCH 06/21] Check Properties for nullptr Properties can be nullptr in some cases, default to read write. 
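A minimal standalone sketch of the defaulting pattern, with a simplified
stand-in struct for the real UR properties type (names here are illustrative
only):

    #include <cstdint>
    using ur_mem_flags_t = uint32_t;
    constexpr ur_mem_flags_t UR_MEM_FLAG_READ_WRITE = 0x1;
    struct ArgMemObjProperties { ur_mem_flags_t memoryAccess; };

    // Callers may legally pass a null properties pointer, so fall back to
    // read-write access rather than dereferencing it unconditionally.
    ur_mem_flags_t resolveMemAccess(const ArgMemObjProperties *Properties) {
      return Properties ? Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE;
    }

The hunk below applies this same null check before reading
Properties->memoryAccess.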
--- source/adapters/cuda/kernel.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index a3c5e607a9..675fdbe0a3 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -303,7 +303,9 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_result_t Result = UR_RESULT_SUCCESS; try { auto Device = hKernel->getProgram()->getDevice(); - hKernel->Args.addMemObjArg(argIndex, hArgValue, Properties->memoryAccess); + ur_mem_flags_t MemAccess = + Properties ? Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE; + hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); if (hArgValue->isImage()) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( From f83d121beb4e19328dbf940b107bbe0b99137439 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 2 May 2024 10:45:41 +0100 Subject: [PATCH 07/21] Clang format --- test/adapters/cuda/context_tests.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/test/adapters/cuda/context_tests.cpp b/test/adapters/cuda/context_tests.cpp index 7d6a329b9a..b08e22c8f9 100644 --- a/test/adapters/cuda/context_tests.cpp +++ b/test/adapters/cuda/context_tests.cpp @@ -120,7 +120,6 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) { // the first context, and then create and release another queue with // the second context. auto test_thread = std::thread([&] { - { // create a queue with the first context uur::raii::Queue queue; From f8eba018a6ce106789cc69b3a13455708ae995c0 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 13 May 2024 13:48:54 +0100 Subject: [PATCH 08/21] Don't use LastDeviceWritingToMemObj LastEventWritingToMemObj is never created with interop, meaning getQueue()->getDevice() should never fail. So we don't need LastDeviceWritingToMemObj. 
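A minimal sketch of the invariant this relies on (the helper name is
hypothetical and not part of the adapter; assumes the adapter's handle types
and <cassert>):

    // LastEventWritingToMemObj is only ever set from enqueue paths, never
    // from interop, so the event always has an associated queue and the
    // device holding the freshest copy can be recovered from it.
    ur_device_handle_t lastWritingDevice(ur_mem_handle_t Mem) {
      assert(Mem->LastEventWritingToMemObj &&
             Mem->LastEventWritingToMemObj->getQueue());
      return Mem->LastEventWritingToMemObj->getQueue()->getDevice();
    }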
--- source/adapters/cuda/enqueue.cpp | 18 ++++++++++-------- source/adapters/cuda/memory.cpp | 23 +++++++++++++++-------- source/adapters/cuda/memory.hpp | 20 +++++++------------- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 33907c49fc..4dde1868bf 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -499,8 +499,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // if it has been written to if (phEvent && (MemArg.AccessFlags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get(), - hQueue->getDevice()); + MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); } } // We can release the MemoryMigrationMutexes now @@ -629,8 +628,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastDeviceWritingToMemObj != hQueue->getDevice()) { - Device = hBuffer->LastDeviceWritingToMemObj; + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + Device = hBuffer->LastEventWritingToMemObj->getQueue()->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -1018,8 +1018,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( // last queue to write to the Image, meaning we must perform the copy // from a different device if (hImage->LastEventWritingToMemObj && - hImage->LastDeviceWritingToMemObj != hQueue->getDevice()) { - Device = hImage->LastDeviceWritingToMemObj; + hImage->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + Device = hImage->LastEventWritingToMemObj->getQueue()->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -1636,8 +1637,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastDeviceWritingToMemObj != hQueue->getDevice()) { - Device = hBuffer->LastDeviceWritingToMemObj; + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + Device = hBuffer->LastEventWritingToMemObj->getQueue()->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 89abd287ed..b93885f5e5 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -498,10 +498,12 @@ ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, UR_CHECK_ERROR( cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); } - } else if (Mem->LastDeviceWritingToMemObj != hDevice) { - UR_CHECK_ERROR(cuMemcpyDtoD(Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastDeviceWritingToMemObj), - Buffer.Size)); + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { + UR_CHECK_ERROR(cuMemcpyDtoD( + Buffer.getPtr(hDevice), + Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Buffer.Size)); } 
return UR_RESULT_SUCCESS; } @@ -551,19 +553,24 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.srcHost = Image.HostPtr; UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); } - } else if (Mem->LastDeviceWritingToMemObj != hDevice) { + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { // FIXME: 1D memcpy from DtoD going through the host. UR_CHECK_ERROR(cuMemcpyAtoH( - Image.HostPtr, Image.getArray(Mem->LastDeviceWritingToMemObj), + Image.HostPtr, + Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray(Mem->LastDeviceWritingToMemObj); + CpyDesc2D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray(Mem->LastDeviceWritingToMemObj); + CpyDesc3D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); } } diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 9dcbe23a22..66e55e3bef 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -18,6 +18,7 @@ #include "common.hpp" #include "context.hpp" #include "device.hpp" +#include "event.hpp" ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); @@ -330,7 +331,8 @@ struct SurfaceMem { /// 2. urEnqueueMem(Buffer|Image)Read(Rect) /// /// Migrations will occur in both cases if the most recent version of data -/// is on a different device, marked by LastDeviceWritingToMemObj +/// is on a different device, marked by +/// LastEventWritingToMemObj->getQueue()->getDevice() /// /// Example trace: /// ~~~~~~~~~~~~~~ @@ -407,11 +409,6 @@ struct ur_mem_handle_t_ { // in this ur_mem_handle_t_ ur_event_handle_t LastEventWritingToMemObj{nullptr}; - // Since the event may not contain device info (if using interop, which - // doesn't take a queue) we should use this member var to keep track of which - // device has most recent view of data - ur_device_handle_t LastDeviceWritingToMemObj{nullptr}; - // Enumerates all possible types of accesses. 
enum access_mode_t { unknown, read_write, read_only, write_only }; @@ -505,23 +502,20 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent, - ur_device_handle_t RecentDevice) { + void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { assert(NewEvent && "Invalid event!"); // This entry point should only ever be called when using multi device ctx assert(Context->Devices.size() > 1); urEventRetain(NewEvent); - urDeviceRetain(RecentDevice); if (LastEventWritingToMemObj != nullptr) { urEventRelease(LastEventWritingToMemObj); } - if (LastDeviceWritingToMemObj != nullptr) { - urDeviceRelease(LastDeviceWritingToMemObj); - } LastEventWritingToMemObj = NewEvent; for (const auto &Device : Context->getDevices()) { + // This event is never an interop event so will always have an associated + // queue HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] = - Device == RecentDevice; + Device == NewEvent->getQueue()->getDevice(); } } }; From edb1842c240504fe0519087db1ba0834255916c1 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 13 May 2024 14:39:01 +0100 Subject: [PATCH 09/21] Change the order of setting contexts enqueueEventsWait may set a different context, so make sure a context is set just before a command starts. --- source/adapters/cuda/enqueue.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 4dde1868bf..8b085cc816 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -630,7 +630,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( if (hBuffer->LastEventWritingToMemObj && hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != hQueue->getDevice()) { - Device = hBuffer->LastEventWritingToMemObj->getQueue()->getDevice(); + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -639,11 +640,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( &hBuffer->LastEventWritingToMemObj)); } - ScopedContext Active(Device); - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); + // enqueueEventsWait may set a context so we need to reset it here + ScopedContext Active(Device); + if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -1639,7 +1641,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( if (hBuffer->LastEventWritingToMemObj && hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != hQueue->getDevice()) { - Device = hBuffer->LastEventWritingToMemObj->getQueue()->getDevice(); + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -1648,11 +1651,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( &hBuffer->LastEventWritingToMemObj)); } - ScopedContext Active(Device); - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); + // enqueueEventsWait may set a context so we need to reset it here + ScopedContext Active(Device); + if (phEvent) { RetImplEvent = 
std::unique_ptr(ur_event_handle_t_::makeNative( From 59a14fa12ee5475dd6d0ee76064c3270901bf980 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Mon, 13 May 2024 17:01:51 +0100 Subject: [PATCH 10/21] Fix race condition Wasn't using the correct event dependencies. This fixes that. --- source/adapters/cuda/enqueue.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 8b085cc816..fc1372e37c 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -474,8 +474,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList)); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), + DepEvents.data())); } // For memory migration across devices in the same context From 87afd90a96928687ce7480a8966bfc9da0de5c09 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 10:32:01 +0100 Subject: [PATCH 11/21] Change from UR_CHECK_ERROR to manually return A call that might return UR_RESULT_ERROR_ADAPTER_SPECIFIC is not handled by UR_CHECK_ERROR. --- source/adapters/cuda/enqueue.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index fc1372e37c..bf00fe58e3 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -459,10 +459,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); - UR_CHECK_ERROR(setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - pGlobalWorkOffset, pGlobalWorkSize, - pLocalWorkSize, hKernel, CuFunc, - ThreadsPerBlock, BlocksPerGrid)); + // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled + // using the standard UR_CHECK_ERROR + if (ur_result_t Ret = + setKernelParams(hQueue->getContext(), hQueue->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); + Ret != UR_RESULT_SUCCESS) + return Ret; try { std::unique_ptr RetImplEvent{nullptr}; From def30e8822b34522f285d7256851cc7dcf54159d Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 10:40:51 +0100 Subject: [PATCH 12/21] Don't change order of active context --- source/adapters/cuda/enqueue.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index bf00fe58e3..b184f2f04f 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -644,12 +644,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( &hBuffer->LastEventWritingToMemObj)); } + ScopedContext Active(Device); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); - // enqueueEventsWait may set a context so we need to reset it here - ScopedContext Active(Device); - if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -1655,12 +1654,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( &hBuffer->LastEventWritingToMemObj)); } + ScopedContext Active(Device); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); - // enqueueEventsWait may set a context so we need to reset it here - 
ScopedContext Active(Device); - if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( From 3c66fbeb78dc72e4e0234bebe289c27a52201963 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 10:56:18 +0100 Subject: [PATCH 13/21] Don't remove host allocation funcs Reinstate accidental removal of host register/allocation funcs. --- source/adapters/cuda/memory.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index b93885f5e5..b756f05061 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -43,8 +43,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { + UR_CHECK_ERROR( + cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); AllocMode = BufferMem::AllocMode::AllocHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = BufferMem::AllocMode::CopyIn; From 776317ae5d71ce95bd65709e4b491c22cbd3fe00 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 11:26:26 +0100 Subject: [PATCH 14/21] Add missing HostPtr arg Removed in error. --- source/adapters/cuda/memory.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index b756f05061..6aa4ef4be0 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -529,6 +529,7 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { memset(&CpyDesc2D, 0, sizeof(CpyDesc2D)); CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc2D.srcHost = Image.HostPtr; CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; CpyDesc2D.dstArray = ImageArray; CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; @@ -537,6 +538,7 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { memset(&CpyDesc3D, 0, sizeof(CpyDesc3D)); CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc3D.srcHost = Image.HostPtr; CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; CpyDesc3D.dstArray = ImageArray; CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; From 46f76ff6163f688a458b52bfa672c9f0554fa90e Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 11:42:59 +0100 Subject: [PATCH 15/21] Update queue if copying from different device If we need to copy from a different queue to the one passed to the UR entry point, we should make all operations work on that different queue. 
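That is, if the freshest copy of the memory was written from a different queue
than the one passed in, rebind to that queue. A condensed sketch of the
pattern, mirroring the buffer read paths updated in the earlier patches:

    // If another device wrote the freshest copy, rebind both the queue and
    // the device so that stream selection, event waits and the copy itself
    // all run on the queue that owns that copy.
    if (hImage->LastEventWritingToMemObj &&
        hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
            hQueue->getDevice()) {
      hQueue = hImage->LastEventWritingToMemObj->getQueue();
      Device = hQueue->getDevice();
    }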
--- source/adapters/cuda/enqueue.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index b184f2f04f..badc3778c7 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1025,7 +1025,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( if (hImage->LastEventWritingToMemObj && hImage->LastEventWritingToMemObj->getQueue()->getDevice() != hQueue->getDevice()) { - Device = hImage->LastEventWritingToMemObj->getQueue()->getDevice(); + hQueue = hImage->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); Stream = CUstream{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last From a1566dd5618ec85da377ec6f576ae3f286c475b1 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 14:54:17 +0100 Subject: [PATCH 16/21] Return device for get info Not the device index. --- source/adapters/cuda/program.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index eed17b9116..c5cb763ada 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -374,8 +374,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue( - static_cast(hProgram->getDevice()->getIndex())); + return ReturnValue(&hProgram->Device, 1); case UR_PROGRAM_INFO_SOURCE: return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: From b51cc46d674ebd043d97a70b8aa1ef8ac621702d Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 15:22:36 +0100 Subject: [PATCH 17/21] Check that the queue is nullptr If the call to urQueueCreate fails, then we expect the queue to hold nullptr. 
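The contract being asserted, in miniature (context2 and device1 come from the
test fixture below):

    ur_queue_handle_t queue = nullptr;
    // A failing urQueueCreate must not populate the output handle,
    // so the handle should still hold its initial nullptr afterwards.
    ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_DEVICE,
                     urQueueCreate(context2, device1, nullptr, &queue));
    ASSERT_EQ(queue, nullptr);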
--- test/conformance/queue/urQueueCreate.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp index 168285d3f8..ad0957d747 100644 --- a/test/conformance/queue/urQueueCreate.cpp +++ b/test/conformance/queue/urQueueCreate.cpp @@ -140,7 +140,7 @@ TEST_F(urQueueCreateTestMultipleDevices, ContextFromWrongDevice) { ur_queue_handle_t queue = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_DEVICE, urQueueCreate(context2, device1, nullptr, &queue)); - ASSERT_NE(queue, nullptr); + ASSERT_EQ(queue, nullptr); } TEST_P(urQueueCreateTest, InvalidNullHandleContext) { From 24b3336e5aab4b669371deaab20c8c8c1ed79164 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Tue, 14 May 2024 15:30:24 +0100 Subject: [PATCH 18/21] Update XFAILS --- test/conformance/program/program_adapter_cuda.match | 1 + 1 file changed, 1 insertion(+) diff --git a/test/conformance/program/program_adapter_cuda.match b/test/conformance/program/program_adapter_cuda.match index 8a4835f942..5f6c2de212 100644 --- a/test/conformance/program/program_adapter_cuda.match +++ b/test/conformance/program/program_adapter_cuda.match @@ -2,6 +2,7 @@ urProgramBuildTest.BuildFailure/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urProgramCreateWithILTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramCreateWithILTest.SuccessWithProperties/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramCreateWithILTest.BuildInvalidProgram/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} {{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} From c0da06eef03251eec869a896d6391a576e8e67e2 Mon Sep 17 00:00:00 2001 From: Hugh Delaney Date: Thu, 16 May 2024 16:50:01 +0100 Subject: [PATCH 19/21] Store HostPtr in ur mem We might want to write this mem into an image on another device. 
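A minimal sketch of the caching this adds (template argument written out here
for clarity; see also the FIXME in the hunk below about the host pointer's
lifetime):

    auto &Image = std::get<SurfaceMem>(hImage->Mem);
    // Remember the host source of the write so that a later lazy migration
    // can replay the upload on another device. This assumes pSrc outlives
    // the image, as flagged in the FIXME below.
    if (!Image.HostPtr)
      Image.HostPtr = pSrc;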
--- source/adapters/cuda/enqueue.cpp | 10 +++++++--- source/adapters/cuda/memory.cpp | 26 +++++++++++++------------- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index badc3778c7..8a02d5e31a 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -1098,6 +1098,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( std::ignore = slicePitch; UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + auto &Image = std::get(hImage->Mem); + // FIXME: We are assuming that the lifetime of host ptr lives as long as the + // image + if (!Image.HostPtr) + Image.HostPtr = pSrc; ur_result_t Result = UR_RESULT_SUCCESS; @@ -1107,8 +1112,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray Array = - std::get(hImage->Mem).getArray(hQueue->getDevice()); + CUarray Array = Image.getArray(hQueue->getDevice()); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -1126,7 +1130,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = std::get(hImage->Mem).getType(); + ur_mem_type_t ImgType = Image.getType(); if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 6aa4ef4be0..301c1d99e1 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -528,35 +528,35 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // dimensionality if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { memset(&CpyDesc2D, 0, sizeof(CpyDesc2D)); - CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc2D.srcHost = Image.HostPtr; CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; CpyDesc2D.dstArray = ImageArray; CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; CpyDesc2D.Height = Image.ImageDesc.height; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { memset(&CpyDesc3D, 0, sizeof(CpyDesc3D)); - CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc3D.srcHost = Image.HostPtr; CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; CpyDesc3D.dstArray = ImageArray; CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; CpyDesc3D.Height = Image.ImageDesc.height; CpyDesc3D.Depth = Image.ImageDesc.depth; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); } if (Mem->LastEventWritingToMemObj == nullptr) { - if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); - } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); - } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + if (Image.HostPtr) { + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR( + cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc2D.srcHost = Image.HostPtr; + UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + } else if 
(Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+          CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+          CpyDesc3D.srcHost = Image.HostPtr;
+          UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+        }
       }
     } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
                hDevice) {

From ae74ef4e2d68d97211ee02dc32ac9259ed5939e6 Mon Sep 17 00:00:00 2001
From: Hugh Delaney
Date: Tue, 21 May 2024 10:41:03 +0100
Subject: [PATCH 20/21] Make sure context is set before getting stream

Context was not set before getting the stream, leading to an invalid context
in a multi-threaded application.

---
 source/adapters/cuda/enqueue.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 8a02d5e31a..9627fc6da2 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -625,6 +625,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
   ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
   auto Device = hQueue->getDevice();
+  ScopedContext Active(Device);
   CUstream Stream = hQueue->getNextTransferStream();
   try {
@@ -644,8 +645,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
           &hBuffer->LastEventWritingToMemObj));
     }
-    ScopedContext Active(Device);
-
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
                                      phEventWaitList));
@@ -1640,6 +1639,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
   std::unique_ptr RetImplEvent{nullptr};
   ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
   auto Device = hQueue->getDevice();
+  ScopedContext Active(Device);
   CUstream Stream = hQueue->getNextTransferStream();
   try {
@@ -1659,8 +1659,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
           &hBuffer->LastEventWritingToMemObj));
     }
-    ScopedContext Active(Device);
-
     UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
                                      phEventWaitList));

From 714200608150438d19ec5ffa63ee2cfb47f8ff55 Mon Sep 17 00:00:00 2001
From: Hugh Delaney
Date: Tue, 21 May 2024 11:18:13 +0100
Subject: [PATCH 21/21] Don't assume context contains all devices in platform

When indexing into Ptrs, which are per-context, don't use the per-platform
device index.
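A sketch of why the modulo is needed: per-context vectors such as Ptrs are
sized to the context's device count, not the platform's (the device counts in
the comment are an illustrative assumption):

    // Platform devices {d0, d1}, context created with only {d1}:
    // Buffer.Ptrs.size() == 1 but d1->getIndex() == 1, so indexing with the
    // raw platform-wide index would run off the end of the vector.
    auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];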
--- source/adapters/cuda/memory.cpp | 16 ++++++++++------ source/adapters/cuda/memory.hpp | 12 +++++++----- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index 301c1d99e1..c28ce98748 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -433,7 +433,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, if (Mem->isBuffer()) { auto &Buffer = std::get(Mem->Mem); - auto &DevPtr = Buffer.Ptrs[hDevice->getIndex()]; + auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()]; // Allocation has already been made if (DevPtr != BufferMem::native_type{0}) { @@ -456,11 +456,11 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, try { auto &Image = std::get(Mem->Mem); // Allocation has already been made - if (Image.Arrays[hDevice->getIndex()]) { + if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) { return UR_RESULT_SUCCESS; } UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc)); - Image.Arrays[hDevice->getIndex()] = ImageArray; + Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray; // CUDA_RESOURCE_DESC is a union of different structs, shown here // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html @@ -475,7 +475,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, ImageResDesc.flags = 0; UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - Image.SurfObjs[hDevice->getIndex()] = Surface; + Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface; } catch (ur_result_t Err) { if (ImageArray) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); @@ -590,7 +590,9 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Device allocation has already been initialized with most up to date // data in buffer - if (Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()]) { + if (Mem->HaveMigratedToDeviceSinceLastWrite + [hDevice->getIndex() % + Mem->HaveMigratedToDeviceSinceLastWrite.size()]) { return UR_RESULT_SUCCESS; } @@ -601,6 +603,8 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); } - Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()] = true; + Mem->HaveMigratedToDeviceSinceLastWrite + [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] = + true; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 66e55e3bef..6b7e9d0156 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -104,7 +104,8 @@ struct BufferMem { throw Err; } return reinterpret_cast( - reinterpret_cast(Ptrs[Device->getIndex()]) + Offset); + reinterpret_cast(Ptrs[Device->getIndex() % Ptrs.size()]) + + Offset); } native_type getPtr(const ur_device_handle_t Device) { @@ -274,7 +275,7 @@ struct SurfaceMem { Err != UR_RESULT_SUCCESS) { throw Err; } - return Arrays[Device->getIndex()]; + return Arrays[Device->getIndex() % Arrays.size()]; } // Will allocate a new surface on device if not already allocated CUsurfObject getSurface(const ur_device_handle_t Device) { @@ -283,7 +284,7 @@ struct SurfaceMem { Err != UR_RESULT_SUCCESS) { throw Err; } - return SurfObjs[Device->getIndex()]; + return SurfObjs[Device->getIndex() % SurfObjs.size()]; } ur_mem_type_t getType() { return ImageDesc.type; } @@ -514,8 +515,9 @@ struct ur_mem_handle_t_ { for (const auto 
&Device : Context->getDevices()) { // This event is never an interop event so will always have an associated // queue - HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] = - Device == NewEvent->getQueue()->getDevice(); + HaveMigratedToDeviceSinceLastWrite + [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] = + Device == NewEvent->getQueue()->getDevice(); } } };