diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 186310287f212..d766ffe9f0501 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -24,7 +24,8 @@ std::string getCudaVersionString() {
   cuDriverGetVersion(&driver_version);
   // The version is returned as (1000 major + 10 minor).
   std::stringstream stream;
-  stream << "CUDA " << driver_version / 1000 << "." << driver_version % 100;
+  stream << "CUDA " << driver_version / 1000 << "."
+         << driver_version % 1000 / 10;
   return stream.str();
 }
 
@@ -471,6 +472,13 @@ pi_result getInfo(size_t param_value_size, void *param_value,
                  param_value_size_ret, value);
 }
 
+int getAttribute(pi_device device, CUdevice_attribute attribute) {
+  int value;
+  cl::sycl::detail::pi::assertion(
+      cuDeviceGetAttribute(&value, attribute, device->get()) == CUDA_SUCCESS);
+  return value;
+}
+
 /// Finds kernel names by searching for entry points in the PTX source, as the
 /// CUDA driver API doesn't expose an operation for this.
 /// Note: This is currently only being used by the SYCL program class for the
@@ -1176,6 +1184,125 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
     return getInfo(param_value_size, param_value, param_value_size_ret,
                    static_cast<cl_device_partition_property>(0u));
   }
+
+    // Intel USM extensions
+
+  case PI_DEVICE_INFO_USM_HOST_SUPPORT: {
+    // from cl_intel_unified_shared_memory: "The host memory access
+    // capabilities apply to any host allocation."
+    //
+    // query if/how the device can access page-locked host memory, possibly
+    // through PCIe, using the same pointer as the host
+    pi_bitfield value = {};
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING)) {
+      // the device shares a unified address space with the host
+      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+          6) {
+        // compute capability 6.x introduces operations that are atomic with
+        // respect to other CPUs and GPUs in the system
+        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
+                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
+      } else {
+        // on GPU architectures with compute capability lower than 6.x, atomic
+        // operations from the GPU to CPU memory will not be atomic with
+        // respect to CPU initiated atomic operations
+        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
+      }
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+  case PI_DEVICE_INFO_USM_DEVICE_SUPPORT: {
+    // from cl_intel_unified_shared_memory:
+    // "The device memory access capabilities apply to any device allocation
+    // associated with this device."
+    //
+    // query how the device can access memory allocated on the device itself
+    pi_bitfield value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
+                        PI_USM_CONCURRENT_ACCESS |
+                        PI_USM_CONCURRENT_ATOMIC_ACCESS;
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+  case PI_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: {
+    // from cl_intel_unified_shared_memory:
+    // "The single device shared memory access capabilities apply to any shared
+    // allocation associated with this device."
+    //
+    // query if/how the device can access managed memory associated to it
+    pi_bitfield value = {};
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
+      // the device can allocate managed memory on this system
+      value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS;
+    }
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      // the device can coherently access managed memory concurrently with the
+      // CPU
+      value |= PI_USM_CONCURRENT_ACCESS;
+      if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+          6) {
+        // compute capability 6.x introduces operations that are atomic with
+        // respect to other CPUs and GPUs in the system
+        value |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
+      }
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+  case PI_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT: {
+    // from cl_intel_unified_shared_memory:
+    // "The cross-device shared memory access capabilities apply to any shared
+    // allocation associated with this device, or to any shared memory
+    // allocation on another device that also supports the same cross-device
+    // shared memory access capability."
+    //
+    // query if/how the device can access managed memory associated to other
+    // devices
+    pi_bitfield value = {};
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY)) {
+      // the device can allocate managed memory on this system
+      value |= PI_USM_ACCESS;
+    }
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
+      // all devices with the CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+      // attribute can coherently access managed memory concurrently with the
+      // CPU
+      value |= PI_USM_CONCURRENT_ACCESS;
+    }
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) >=
+        6) {
+      // compute capability 6.x introduces operations that are atomic with
+      // respect to other CPUs and GPUs in the system
+      if (value & PI_USM_ACCESS)
+        value |= PI_USM_ATOMIC_ACCESS;
+      if (value & PI_USM_CONCURRENT_ACCESS)
+        value |= PI_USM_CONCURRENT_ATOMIC_ACCESS;
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+  case PI_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT: {
+    // from cl_intel_unified_shared_memory:
+    // "The shared system memory access capabilities apply to any allocations
+    // made by a system allocator, such as malloc or new."
+    //
+    // query if/how the device can access pageable host memory allocated by the
+    // system allocator
+    pi_bitfield value = {};
+    if (getAttribute(device, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS)) {
+      // the device supports coherently accessing pageable memory without
+      // calling cuMemHostRegister/cudaHostRegister on it
+      if (getAttribute(device,
+                       CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED)) {
+        // the link between the device and the host supports native atomic
+        // operations
+        value = PI_USM_ACCESS | PI_USM_ATOMIC_ACCESS |
+                PI_USM_CONCURRENT_ACCESS | PI_USM_CONCURRENT_ATOMIC_ACCESS;
+      } else {
+        // the link between the device and the host does not support native
+        // atomic operations
+        value = PI_USM_ACCESS | PI_USM_CONCURRENT_ACCESS;
+      }
+    }
+    return getInfo(param_value_size, param_value, param_value_size_ret, value);
+  }
+
   default:
     PI_HANDLE_UNKNOWN_PARAM_NAME(param_name);
   }
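The capability derivation above can be sanity-checked outside the plugin. Below is a minimal standalone sketch (illustrative, not part of the patch) that mirrors the PI_DEVICE_INFO_USM_HOST_SUPPORT logic against the raw CUDA driver API; getAttr is a hypothetical stand-in for the plugin's getAttribute helper.

    // usm_caps.cpp: standalone mirror of the host-USM capability derivation.
    // Build (assuming a CUDA toolkit is installed): g++ usm_caps.cpp -lcuda
    #include <cuda.h>
    #include <cstdio>

    // hypothetical stand-in for the plugin's getAttribute()
    static int getAttr(CUdevice dev, CUdevice_attribute attr) {
      int value = 0;
      cuDeviceGetAttribute(&value, attr, dev);
      return value;
    }

    int main() {
      cuInit(0);
      CUdevice dev;
      cuDeviceGet(&dev, 0);
      int unified = getAttr(dev, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING);
      int cc_major = getAttr(dev, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
      // same rule as the patch: host allocations are accessible with unified
      // addressing; concurrent atomics additionally need compute capability 6+
      std::printf("host USM access: %d, concurrent atomics: %d\n", unified,
                  unified && cc_major >= 6);
      return 0;
    }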
@@ -2346,6 +2473,13 @@ pi_result cuda_piKernelSetExecInfo(
   return PI_SUCCESS;
 }
 
+pi_result cuda_piextKernelSetArgPointer(pi_kernel kernel, pi_uint32 arg_index,
+                                        size_t arg_size,
+                                        const void *arg_value) {
+  kernel->set_kernel_arg(arg_index, arg_size, arg_value);
+  return PI_SUCCESS;
+}
+
 //
 // Events
 //
@@ -3009,6 +3143,312 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,
   return ret_err;
 }
 
+// USM
+pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
+                                 pi_usm_mem_properties *properties,
+                                 size_t size, pi_uint32 alignment) {
+  assert(result_ptr != nullptr);
+  assert(context != nullptr);
+  assert(properties == nullptr);
+  pi_result result = PI_SUCCESS;
+  try {
+    ScopedContext active(context);
+    result = PI_CHECK_ERROR(cuMemAllocHost(result_ptr, size));
+  } catch (pi_result error) {
+    result = error;
+  }
+  assert(alignment == 0 ||
+         reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  return result;
+}
+
+pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
+                                   pi_device device,
+                                   pi_usm_mem_properties *properties,
+                                   size_t size, pi_uint32 alignment) {
+  assert(result_ptr != nullptr);
+  assert(context != nullptr);
+  assert(device != nullptr);
+  assert(properties == nullptr);
+  pi_result result = PI_SUCCESS;
+  try {
+    ScopedContext active(context);
+    result = PI_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)result_ptr, size));
+  } catch (pi_result error) {
+    result = error;
+  }
+  assert(alignment == 0 ||
+         reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  return result;
+}
+
+pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
+                                   pi_device device,
+                                   pi_usm_mem_properties *properties,
+                                   size_t size, pi_uint32 alignment) {
+  assert(result_ptr != nullptr);
+  assert(context != nullptr);
+  assert(device != nullptr);
+  assert(properties == nullptr);
+  pi_result result = PI_SUCCESS;
+  try {
+    ScopedContext active(context);
+    result = PI_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)result_ptr, size,
+                                              CU_MEM_ATTACH_GLOBAL));
+  } catch (pi_result error) {
+    result = error;
+  }
+  assert(alignment == 0 ||
+         reinterpret_cast<std::uintptr_t>(*result_ptr) % alignment == 0);
+  return result;
+}
+
+pi_result cuda_piextUSMFree(pi_context context, void *ptr) {
+  assert(context != nullptr);
+  pi_result result = PI_SUCCESS;
+  try {
+    ScopedContext active(context);
+    unsigned int type;
+    result = PI_CHECK_ERROR(cuPointerGetAttribute(
+        &type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr));
+    assert(type == CU_MEMORYTYPE_DEVICE or type == CU_MEMORYTYPE_HOST);
+    if (type == CU_MEMORYTYPE_DEVICE) {
+      result = PI_CHECK_ERROR(cuMemFree((CUdeviceptr)ptr));
+    }
+    if (type == CU_MEMORYTYPE_HOST) {
+      result = PI_CHECK_ERROR(cuMemFreeHost(ptr));
+    }
+  } catch (pi_result error) {
+    result = error;
+  }
+  return result;
+}
+
+pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
+                                     size_t count,
+                                     pi_uint32 num_events_in_waitlist,
+                                     const pi_event *events_waitlist,
+                                     pi_event *event) {
+  assert(queue != nullptr);
+  assert(ptr != nullptr);
+  CUstream cuStream = queue->get();
+  pi_result result = PI_SUCCESS;
+  std::unique_ptr<_pi_event> event_ptr{nullptr};
+
+  try {
+    ScopedContext active(queue->get_context());
+    result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist,
+                                      events_waitlist, nullptr);
+    if (event) {
+      event_ptr = std::unique_ptr<_pi_event>(
+          _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue));
+      event_ptr->start();
+    }
+    result = PI_CHECK_ERROR(cuMemsetD8Async(
+        (CUdeviceptr)ptr, (unsigned char)value & 0xFF, count, cuStream));
+    if (event) {
+      result = event_ptr->record();
+      *event = event_ptr.release();
+    }
+  } catch (pi_result err) {
+    result = err;
+  }
+  return result;
+}
+
+pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking,
+                                     void *dst_ptr, const void *src_ptr,
+                                     size_t size,
+                                     pi_uint32 num_events_in_waitlist,
+                                     const pi_event *events_waitlist,
+                                     pi_event *event) {
+  assert(queue != nullptr);
+  assert(dst_ptr != nullptr);
+  assert(src_ptr != nullptr);
+  CUstream cuStream = queue->get();
+  pi_result result = PI_SUCCESS;
+  std::unique_ptr<_pi_event> event_ptr{nullptr};
+
+  try {
+    ScopedContext active(queue->get_context());
+    result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist,
+                                      events_waitlist, nullptr);
+    if (event) {
+      event_ptr = std::unique_ptr<_pi_event>(
+          _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue));
+      event_ptr->start();
+    }
+    result = PI_CHECK_ERROR(cuMemcpyAsync(
+        (CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, size, cuStream));
+    if (event) {
+      result = event_ptr->record();
+    }
+    if (blocking) {
+      result = PI_CHECK_ERROR(cuStreamSynchronize(cuStream));
+    }
+    if (event) {
+      *event = event_ptr.release();
+    }
+  } catch (pi_result err) {
+    result = err;
+  }
+  return result;
+}
+
+pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr,
+                                       size_t size,
+                                       pi_usm_migration_flags flags,
+                                       pi_uint32 num_events_in_waitlist,
+                                       const pi_event *events_waitlist,
+                                       pi_event *event) {
+  assert(queue != nullptr);
+  assert(ptr != nullptr);
+  CUstream cuStream = queue->get();
+  pi_result result = PI_SUCCESS;
+  std::unique_ptr<_pi_event> event_ptr{nullptr};
+
+  // TODO implement handling the flags once the expected behaviour
+  // of piextUSMEnqueuePrefetch is detailed in the USM extension
+  assert(flags == 0u);
+
+  try {
+    ScopedContext active(queue->get_context());
+    result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist,
+                                      events_waitlist, nullptr);
+    if (event) {
+      event_ptr = std::unique_ptr<_pi_event>(
+          _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue));
+      event_ptr->start();
+    }
+    result = PI_CHECK_ERROR(cuMemPrefetchAsync(
+        (CUdeviceptr)ptr, size, queue->get_context()->get_device()->get(),
+        cuStream));
+    if (event) {
+      result = event_ptr->record();
+      *event = event_ptr.release();
+    }
+  } catch (pi_result err) {
+    result = err;
+  }
+  return result;
+}
+
+// USM memadvise API to govern behavior of automatic migration mechanisms
+pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr,
+                                        size_t length, int advice,
+                                        pi_event *event) {
+  assert(queue != nullptr);
+  assert(ptr != nullptr);
+  // TODO implement a mapping to cuMemAdvise once the expected behaviour
+  // of piextUSMEnqueueMemAdvise is detailed in the USM extension
+  return cuda_piEnqueueEventsWait(queue, 0, nullptr, event);
+}
+
+/// API to query information about USM allocated pointers
+/// Valid Queries:
+///   PI_MEM_ALLOC_TYPE returns host/device/shared pi_host_usm value
+///   PI_MEM_ALLOC_BASE_PTR returns the base ptr of an allocation if
+///                         the queried pointer fell inside an allocation.
+///                         Result must fit in void *
+///   PI_MEM_ALLOC_SIZE returns how big the queried pointer's
+///                     allocation is in bytes. Result is a size_t.
+///   PI_MEM_ALLOC_DEVICE returns the pi_device this was allocated against
+///
+/// \param context is the pi_context
+/// \param ptr is the pointer to query
+/// \param param_name is the type of query to perform
+/// \param param_value_size is the size of the result in bytes
+/// \param param_value is the result
+/// \param param_value_size_ret is how many bytes were written
+pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr,
+                                       pi_mem_info param_name,
+                                       size_t param_value_size,
+                                       void *param_value,
+                                       size_t *param_value_size_ret) {
+  assert(context != nullptr);
+  assert(ptr != nullptr);
+  pi_result result = PI_SUCCESS;
+
+  try {
+    ScopedContext active(context);
+    switch (param_name) {
+    case PI_MEM_ALLOC_TYPE: {
+      unsigned int value;
+      // do not throw if cuPointerGetAttribute returns CUDA_ERROR_INVALID_VALUE
+      CUresult ret = cuPointerGetAttribute(
+          &value, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr);
+      if (ret == CUDA_ERROR_INVALID_VALUE) {
+        // pointer not known to the CUDA subsystem
+        return getInfo(param_value_size, param_value, param_value_size_ret,
+                       PI_MEM_TYPE_UNKNOWN);
+      }
+      result = check_error(ret, __func__, __LINE__ - 5, __FILE__);
+      if (value) {
+        // pointer to managed memory
+        return getInfo(param_value_size, param_value, param_value_size_ret,
+                       PI_MEM_TYPE_SHARED);
+      }
+      result = PI_CHECK_ERROR(cuPointerGetAttribute(
+          &value, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)ptr));
+      assert(value == CU_MEMORYTYPE_DEVICE or value == CU_MEMORYTYPE_HOST);
+      if (value == CU_MEMORYTYPE_DEVICE) {
+        // pointer to device memory
+        return getInfo(param_value_size, param_value, param_value_size_ret,
+                       PI_MEM_TYPE_DEVICE);
+      }
+      if (value == CU_MEMORYTYPE_HOST) {
+        // pointer to host memory
+        return getInfo(param_value_size, param_value, param_value_size_ret,
+                       PI_MEM_TYPE_HOST);
+      }
+      // should never get here
+      __builtin_unreachable();
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     PI_MEM_TYPE_UNKNOWN);
+    }
+    case PI_MEM_ALLOC_BASE_PTR: {
+#if __CUDA_API_VERSION >= 10020
+      // CU_POINTER_ATTRIBUTE_RANGE_START_ADDR was introduced in CUDA 10.2
+      void *value;
+      result = PI_CHECK_ERROR(cuPointerGetAttribute(
+          &value, CU_POINTER_ATTRIBUTE_RANGE_START_ADDR, (CUdeviceptr)ptr));
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     value);
+#else
+      return PI_INVALID_VALUE;
+#endif
+    }
+    case PI_MEM_ALLOC_SIZE: {
+#if __CUDA_API_VERSION >= 10020
+      // CU_POINTER_ATTRIBUTE_RANGE_SIZE was introduced in CUDA 10.2
+      size_t value;
+      result = PI_CHECK_ERROR(cuPointerGetAttribute(
+          &value, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)ptr));
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     value);
+#else
+      return PI_INVALID_VALUE;
+#endif
+    }
+    case PI_MEM_ALLOC_DEVICE: {
+      unsigned int value;
+      result = PI_CHECK_ERROR(cuPointerGetAttribute(
+          &value, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)ptr));
+      pi_platform platform;
+      result = cuda_piPlatformsGet(1, &platform, nullptr);
+      pi_device device = platform->devices_[value].get();
+      return getInfo(param_value_size, param_value, param_value_size_ret,
+                     device);
+    }
+    // not documented/implemented yet
+    case PI_MEM_ALLOC_INFO_TBD0:
+    case PI_MEM_ALLOC_INFO_TBD1: {
+      return PI_INVALID_VALUE;
+    }
+    }
+  } catch (pi_result error) {
+    result = error;
+  }
+  return result;
+}
+
 const char SupportedVersion[] = _PI_H_VERSION_STRING;
 
 pi_result piPluginInit(pi_plugin *PluginInit) {
@@ -3027,7 +3467,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   std::memset(&(PluginInit->PiFunctionTable), 0,
               sizeof(PluginInit->PiFunctionTable));
 
-// Forward calls to OpenCL RT.
+// Forward calls to CUDA RT.
 #define _PI_CL(pi_api, cuda_api)                                               \
   (PluginInit->PiFunctionTable).pi_api = (decltype(&::pi_api))(&cuda_api);
 
@@ -3081,7 +3521,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   _PI_CL(piKernelRetain, cuda_piKernelRetain)
   _PI_CL(piKernelRelease, cuda_piKernelRelease)
   _PI_CL(piKernelSetExecInfo, cuda_piKernelSetExecInfo)
-
+  _PI_CL(piextKernelSetArgPointer, cuda_piextKernelSetArgPointer)
   // Event
   _PI_CL(piEventCreate, cuda_piEventCreate)
   _PI_CL(piEventGetInfo, cuda_piEventGetInfo)
@@ -3113,6 +3553,16 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
   _PI_CL(piEnqueueMemImageFill, cuda_piEnqueueMemImageFill)
   _PI_CL(piEnqueueMemBufferMap, cuda_piEnqueueMemBufferMap)
   _PI_CL(piEnqueueMemUnmap, cuda_piEnqueueMemUnmap)
+  // USM
+  _PI_CL(piextUSMHostAlloc, cuda_piextUSMHostAlloc)
+  _PI_CL(piextUSMDeviceAlloc, cuda_piextUSMDeviceAlloc)
+  _PI_CL(piextUSMSharedAlloc, cuda_piextUSMSharedAlloc)
+  _PI_CL(piextUSMFree, cuda_piextUSMFree)
+  _PI_CL(piextUSMEnqueueMemset, cuda_piextUSMEnqueueMemset)
+  _PI_CL(piextUSMEnqueueMemcpy, cuda_piextUSMEnqueueMemcpy)
+  _PI_CL(piextUSMEnqueuePrefetch, cuda_piextUSMEnqueuePrefetch)
+  _PI_CL(piextUSMEnqueueMemAdvise, cuda_piextUSMEnqueueMemAdvise)
+  _PI_CL(piextUSMGetMemAllocInfo, cuda_piextUSMGetMemAllocInfo)
 
   _PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)
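For context, the SYCL-level pattern these entry points serve looks roughly like the sketch below (illustrative only, not one of the tests touched by this patch). On the CUDA backend, malloc_device, queue::memset and queue::memcpy are expected to lower onto cuda_piextUSMDeviceAlloc, cuda_piextUSMEnqueueMemset and cuda_piextUSMEnqueueMemcpy, and free onto cuda_piextUSMFree.

    // usm_roundtrip.cpp: rough sketch of the call pattern the re-enabled
    // USM tests exercise (assumes the 2020-era cl::sycl USM API).
    #include <CL/sycl.hpp>
    #include <cassert>

    int main() {
      cl::sycl::queue q;
      constexpr size_t n = 1024;
      // lowers onto cuda_piextUSMDeviceAlloc (cuMemAlloc)
      int *dev = cl::sycl::malloc_device<int>(n, q);
      // lowers onto cuda_piextUSMEnqueueMemset (cuMemsetD8Async)
      q.memset(dev, 0, n * sizeof(int)).wait();
      int host[n];
      // lowers onto cuda_piextUSMEnqueueMemcpy (cuMemcpyAsync)
      q.memcpy(host, dev, n * sizeof(int)).wait();
      assert(host[0] == 0 && host[n - 1] == 0);
      // lowers onto cuda_piextUSMFree (cuMemFree after the memory-type query)
      cl::sycl::free(dev, q.get_context());
      return 0;
    }

The test updates below remove the XFAIL: cuda markers from the existing USM tests now that these entry points are implemented; memset.cpp additionally moves its RUN lines above the license header.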
diff --git a/sycl/test/usm/allocator_vector.cpp b/sycl/test/usm/allocator_vector.cpp
index 533f00b38db0a..9cc82d8fac1f5 100644
--- a/sycl/test/usm/allocator_vector.cpp
+++ b/sycl/test/usm/allocator_vector.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---- allocator_vector.cpp - Allocator Container test -------------------==//
 //
diff --git a/sycl/test/usm/allocator_vector_fail.cpp b/sycl/test/usm/allocator_vector_fail.cpp
index f77729f14b6d0..bb033ef753071 100644
--- a/sycl/test/usm/allocator_vector_fail.cpp
+++ b/sycl/test/usm/allocator_vector_fail.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==-- allocator_vector_fail.cpp - Device Memory Allocator fail test -------==//
 //
diff --git a/sycl/test/usm/allocatorll.cpp b/sycl/test/usm/allocatorll.cpp
index dec3c4ff837d7..1b7796540686f 100644
--- a/sycl/test/usm/allocatorll.cpp
+++ b/sycl/test/usm/allocatorll.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---- allocatorll.cpp - Device Memory Linked List Allocator test --------==//
 //
diff --git a/sycl/test/usm/badmalloc.cpp b/sycl/test/usm/badmalloc.cpp
index b99f1f50663cf..fc91b1260d465 100644
--- a/sycl/test/usm/badmalloc.cpp
+++ b/sycl/test/usm/badmalloc.cpp
@@ -4,7 +4,6 @@
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
 
 // UNSUPPORTED: windows
-// XFAIL: cuda
 
 //==----------------- badmalloc.cpp - Bad Mallocs test ---------------------==//
 //
diff --git a/sycl/test/usm/depends_on.cpp b/sycl/test/usm/depends_on.cpp
index f4ce565803e31..3943621853836 100644
--- a/sycl/test/usm/depends_on.cpp
+++ b/sycl/test/usm/depends_on.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==----------------- depends_on.cpp - depends_on test ---------------------==//
 //
diff --git a/sycl/test/usm/dmemll.cpp b/sycl/test/usm/dmemll.cpp
index 3236e36344e3c..e5c32b2f20262 100644
--- a/sycl/test/usm/dmemll.cpp
+++ b/sycl/test/usm/dmemll.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==------------------- dmemll.cpp - Device Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/dmemllaligned.cpp b/sycl/test/usm/dmemllaligned.cpp
index d67131839b242..b45e3a5b204cc 100644
--- a/sycl/test/usm/dmemllaligned.cpp
+++ b/sycl/test/usm/dmemllaligned.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---- dmemllaligned.cpp - Aligned Device Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/hmemll.cpp b/sycl/test/usm/hmemll.cpp
index 18db63d192581..38b578ce948c2 100644
--- a/sycl/test/usm/hmemll.cpp
+++ b/sycl/test/usm/hmemll.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==------------------- hmemll.cpp - Host Memory Linked List test ----------==//
 //
diff --git a/sycl/test/usm/hmemllaligned.cpp b/sycl/test/usm/hmemllaligned.cpp
index 7ee2d6cda5fdf..a86abcecd33fa 100644
--- a/sycl/test/usm/hmemllaligned.cpp
+++ b/sycl/test/usm/hmemllaligned.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---- hmemllaligned.cpp - Aligned Host Memory Linked List test ----------==//
 //
diff --git a/sycl/test/usm/memadvise.cpp b/sycl/test/usm/memadvise.cpp
index 111519169c7ac..9b584c045e2e5 100644
--- a/sycl/test/usm/memadvise.cpp
+++ b/sycl/test/usm/memadvise.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---------------- memadvise.cpp - Shared Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/memcpy.cpp b/sycl/test/usm/memcpy.cpp
index e5871374ea3c2..3545cdf5218fd 100644
--- a/sycl/test/usm/memcpy.cpp
+++ b/sycl/test/usm/memcpy.cpp
@@ -8,7 +8,6 @@
 // RUN: %clangxx -fsycl %s -o %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/usm/memset.cpp b/sycl/test/usm/memset.cpp
index 4e01415073f6d..6fb12eb1fcc4d 100644
--- a/sycl/test/usm/memset.cpp
+++ b/sycl/test/usm/memset.cpp
@@ -1,3 +1,7 @@
+// RUN: %clangxx -fsycl %s -o %t1.out
+// RUN: %CPU_RUN_PLACEHOLDER %t1.out
+// RUN: %GPU_RUN_PLACEHOLDER %t1.out
+
 //==---- memset.cpp - USM memset test --------------------------------------==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -5,10 +9,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// RUN: %clangxx -fsycl %s -o %t1.out
-// RUN: %CPU_RUN_PLACEHOLDER %t1.out
-// RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 #include <CL/sycl.hpp>
 
diff --git a/sycl/test/usm/mixed.cpp b/sycl/test/usm/mixed.cpp
index d068fccf8c812..5d45182d2a35e 100644
--- a/sycl/test/usm/mixed.cpp
+++ b/sycl/test/usm/mixed.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==------------------- mixed.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/mixed2.cpp b/sycl/test/usm/mixed2.cpp
index f2b6b79d07a0e..c074e2207b578 100644
--- a/sycl/test/usm/mixed2.cpp
+++ b/sycl/test/usm/mixed2.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==------------------- mixed2.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/mixed_queue.cpp b/sycl/test/usm/mixed_queue.cpp
index f17e6bc6e214d..0585e982179e1 100644
--- a/sycl/test/usm/mixed_queue.cpp
+++ b/sycl/test/usm/mixed_queue.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==-------------- mixed_queue.cpp - Mixed Memory test ---------------------==//
 //
diff --git a/sycl/test/usm/smemll.cpp b/sycl/test/usm/smemll.cpp
index d2a6c3a2d8e2d..4fb79cb8429d8 100644
--- a/sycl/test/usm/smemll.cpp
+++ b/sycl/test/usm/smemll.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==------------------- smemll.cpp - Shared Memory Linked List test --------==//
 //
diff --git a/sycl/test/usm/smemllaligned.cpp b/sycl/test/usm/smemllaligned.cpp
index 0c012b978d028..0d5eeb8aca7de 100644
--- a/sycl/test/usm/smemllaligned.cpp
+++ b/sycl/test/usm/smemllaligned.cpp
@@ -2,7 +2,6 @@
 // RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
 // RUN: %CPU_RUN_PLACEHOLDER %t1.out
 // RUN: %GPU_RUN_PLACEHOLDER %t1.out
-// XFAIL: cuda
 
 //==---- smemllaligned.cpp - Aligned Shared Memory Linked List test --------==//
 //
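Finally, the pointer-introspection path: cl::sycl::get_pointer_type and get_pointer_device lower onto cuda_piextUSMGetMemAllocInfo, which resolves PI_MEM_ALLOC_TYPE via CU_POINTER_ATTRIBUTE_IS_MANAGED / CU_POINTER_ATTRIBUTE_MEMORY_TYPE and PI_MEM_ALLOC_DEVICE via CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL. A rough usage sketch (illustrative, not part of the patch):

    // usm_introspect.cpp: sketch of queries that reach
    // cuda_piextUSMGetMemAllocInfo (assumes the 2020-era cl::sycl USM API).
    #include <CL/sycl.hpp>
    #include <iostream>

    int main() {
      cl::sycl::queue q;
      cl::sycl::context ctx = q.get_context();
      int *p = cl::sycl::malloc_shared<int>(1, q);
      // PI_MEM_ALLOC_TYPE: managed (cuMemAllocManaged) memory reports 'shared'
      auto kind = cl::sycl::get_pointer_type(p, ctx);
      std::cout << "shared? " << (kind == cl::sycl::usm::alloc::shared) << "\n";
      // PI_MEM_ALLOC_DEVICE: the device ordinal is mapped back to a pi_device
      cl::sycl::device dev = cl::sycl::get_pointer_device(p, ctx);
      std::cout << dev.get_info<cl::sycl::info::device::name>() << "\n";
      cl::sycl::free(p, ctx);
      return 0;
    }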