diff --git a/3rdparty/embree/embree.cmake b/3rdparty/embree/embree.cmake
index 2495c987bbd..6d55198ec69 100644
--- a/3rdparty/embree/embree.cmake
+++ b/3rdparty/embree/embree.cmake
@@ -64,6 +64,16 @@ else()
 endif()
 
 
+if(BUILD_SYCL_MODULE)
+    set(ISA_ARGS ${ISA_ARGS} -DCMAKE_CXX_COMPILER=icpx)
+    set(ISA_ARGS ${ISA_ARGS} -DCMAKE_C_COMPILER=icx)
+    set(ISA_ARGS ${ISA_ARGS} -DEMBREE_SYCL_SUPPORT=ON)
+    list(APPEND ISA_LIBS embree4_sycl ze_wrapper)
+    list(APPEND ISA_BUILD_BYPRODUCTS "<INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}embree4_sycl${CMAKE_STATIC_LIBRARY_SUFFIX}"
+                                    "<INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_wrapper${CMAKE_STATIC_LIBRARY_SUFFIX}")
+endif()
+
+
 ExternalProject_Add(
     ext_embree
     PREFIX embree
@@ -92,10 +102,11 @@ ExternalProject_Add(
         <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}sys${CMAKE_STATIC_LIBRARY_SUFFIX}
         <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}math${CMAKE_STATIC_LIBRARY_SUFFIX}
         <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tasking${CMAKE_STATIC_LIBRARY_SUFFIX}
+        <INSTALL_DIR>/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_wrapper${CMAKE_STATIC_LIBRARY_SUFFIX}
         ${ISA_BUILD_BYPRODUCTS}
 )
 
 ExternalProject_Get_Property(ext_embree INSTALL_DIR)
 set(EMBREE_INCLUDE_DIRS ${INSTALL_DIR}/include/ ${INSTALL_DIR}/src/ext_embree/) # "/" is critical.
 set(EMBREE_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR})
-set(EMBREE_LIBRARIES embree4 ${ISA_LIBS} simd lexers sys math tasking)
+set(EMBREE_LIBRARIES embree4 simd lexers sys math tasking ${ISA_LIBS})
diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake
index c3dc3885565..d2afbe405c9 100644
--- a/3rdparty/find_dependencies.cmake
+++ b/3rdparty/find_dependencies.cmake
@@ -1384,7 +1384,7 @@ if(BUILD_GUI)
                                   ${CPP_LIBRARY} ${CPPABI_LIBRARY})
             message(STATUS "Filament C++ libraries: ${CPP_LIBRARY} ${CPPABI_LIBRARY}")
             if (LIBCPP_VERSION GREATER 11)
-                message(WARNING "libc++ (LLVM) version ${LIBCPP_VERSION} > 11 includes libunwind that " 
+                message(WARNING "libc++ (LLVM) version ${LIBCPP_VERSION} > 11 includes libunwind that "
                 "interferes with the system libunwind.so.8 and may crash Python code when exceptions "
                 "are used. Please consider using libc++ (LLVM) v11.")
             endif()
@@ -1697,7 +1697,7 @@ else(OPEN3D_USE_ONEAPI_PACKAGES)
             INCLUDE_DIRS ${STATIC_MKL_INCLUDE_DIR}
             LIB_DIR      ${STATIC_MKL_LIB_DIR}
             LIBRARIES    ${STATIC_MKL_LIBRARIES}
-            DEPENDS      ext_tbb ext_mkl_include ext_mkl
+            DEPENDS      Open3D::3rdparty_tbb ext_mkl_include ext_mkl
         )
         if(UNIX)
             target_compile_options(3rdparty_blas INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-m64>")
@@ -1719,7 +1719,7 @@ else(OPEN3D_USE_ONEAPI_PACKAGES)
     endif()
     if(NOT USE_SYSTEM_TBB)
         include(${Open3D_3RDPARTY_DIR}/mkl/tbb.cmake)
-        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM TBB::tbb)
+        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_tbb)
     else()
         list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb)
     endif()
diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake
index 385af4b644b..a36435ee1b4 100644
--- a/3rdparty/mkl/tbb.cmake
+++ b/3rdparty/mkl/tbb.cmake
@@ -38,3 +38,4 @@ install(TARGETS tbb EXPORT ${PROJECT_NAME}Targets
   RUNTIME DESTINATION ${Open3D_INSTALL_BIN_DIR}
   COMPONENT tbb
 )
+add_library(${PROJECT_NAME}::3rdparty_tbb ALIAS tbb)
diff --git a/cpp/open3d/core/Device.cpp b/cpp/open3d/core/Device.cpp
index 7137e307874..2681fe8dc79 100644
--- a/cpp/open3d/core/Device.cpp
+++ b/cpp/open3d/core/Device.cpp
@@ -137,7 +137,7 @@ std::vector<Device> Device::GetAvailableCUDADevices() {
 }
 
 std::vector<Device> Device::GetAvailableSYCLDevices() {
-    return sycl::GetAvailableSYCLDevices();
+    return sy::GetAvailableSYCLDevices();
 }
 
 void Device::PrintAvailableDevices() {
diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h
index 5b04875ed20..13d6715f025 100644
--- a/cpp/open3d/core/Device.h
+++ b/cpp/open3d/core/Device.h
@@ -21,7 +21,7 @@ class Device {
     enum class DeviceType {
         CPU = 0,
         CUDA = 1,
-        SYCL = 2,  // SYCL gpu_selector().
+        SYCL = 2,  // SYCL gpu_selector_v.
     };
 
     /// Default constructor -> "CPU:0".
diff --git a/cpp/open3d/core/MemoryManagerSYCL.cpp b/cpp/open3d/core/MemoryManagerSYCL.cpp
index c3d642a2c76..cfa92ffc804 100644
--- a/cpp/open3d/core/MemoryManagerSYCL.cpp
+++ b/cpp/open3d/core/MemoryManagerSYCL.cpp
@@ -5,8 +5,8 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
-#include <CL/sycl.hpp>
 #include <cstdlib>
+#include <sycl/sycl.hpp>
 #include <unordered_map>
 
 #include "open3d/core/Device.h"
@@ -18,24 +18,22 @@
 namespace open3d {
 namespace core {
 
-namespace sy = cl::sycl;
-
 void* MemoryManagerSYCL::Malloc(size_t byte_size, const Device& device) {
-    const sy::queue& queue =
-            sycl::SYCLContext::GetInstance().GetDefaultQueue(device);
+    const sycl::queue& queue =
+            sy::SYCLContext::GetInstance().GetDefaultQueue(device);
 
 #ifdef ENABLE_SYCL_UNIFIED_SHARED_MEMORY
-    return static_cast<void*>(sy::malloc_shared(byte_size, queue));
+    return static_cast<void*>(sycl::malloc_shared(byte_size, queue));
 #else
-    return static_cast<void*>(sy::malloc_device(byte_size, queue));
+    return static_cast<void*>(sycl::malloc_device(byte_size, queue));
 #endif
 }
 
 void MemoryManagerSYCL::Free(void* ptr, const Device& device) {
     if (ptr) {
-        const sy::queue& queue =
-                sycl::SYCLContext::GetInstance().GetDefaultQueue(device);
-        sy::free(ptr, queue);
+        const sycl::queue& queue =
+                sy::SYCLContext::GetInstance().GetDefaultQueue(device);
+        sycl::free(ptr, queue);
     }
 }
 
@@ -62,8 +60,8 @@ void MemoryManagerSYCL::Memcpy(void* dst_ptr,
                           dst_device.ToString());
     }
 
-    sy::queue queue =
-            sycl::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue);
+    sycl::queue queue =
+            sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue);
     queue.memcpy(dst_ptr, src_ptr, num_bytes).wait_and_throw();
 }
 
diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp
index 660ab8d2bab..6a3007d7dba 100644
--- a/cpp/open3d/core/SYCLContext.cpp
+++ b/cpp/open3d/core/SYCLContext.cpp
@@ -7,17 +7,17 @@
 
 #include "open3d/core/SYCLContext.h"
 
-#include <CL/sycl.hpp>
 #include <array>
 #include <cstdlib>
 #include <sstream>
+#include <sycl/sycl.hpp>
 
 #include "open3d/core/SYCLUtils.h"
 #include "open3d/utility/Logging.h"
 
 namespace open3d {
 namespace core {
-namespace sycl {
+namespace sy {
 
 SYCLContext &SYCLContext::GetInstance() {
     static thread_local SYCLContext instance;
@@ -38,7 +38,7 @@ bool SYCLContext::IsDeviceAvailable(const Device &device) {
 }
 std::vector<Device> SYCLContext::GetAvailableSYCLDevices() { return devices_; }
 
-sy::queue SYCLContext::GetDefaultQueue(const Device &device) {
+sycl::queue SYCLContext::GetDefaultQueue(const Device &device) {
     return device_to_default_queue_.at(device);
 }
 
@@ -46,12 +46,12 @@ SYCLContext::SYCLContext() {
     // SYCL GPU.
     // TODO: Currently we only support one GPU device.
     try {
-        const sy::device &sycl_device = sy::device(sy::gpu_selector());
+        const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v);
         const Device open3d_device = Device("SYCL:0");
         devices_.push_back(open3d_device);
         device_to_sycl_device_[open3d_device] = sycl_device;
-        device_to_default_queue_[open3d_device] = sy::queue(sycl_device);
-    } catch (const sy::exception &e) {
+        device_to_default_queue_[open3d_device] = sycl::queue(sycl_device);
+    } catch (const sycl::exception &e) {
     }
 
     if (devices_.size() == 0) {
@@ -59,15 +59,16 @@ SYCLContext::SYCLContext() {
         // This could happen if the Intel GPGPU driver is not installed or if
         // your CPU does not have integrated GPU.
         try {
-            const sy::device &sycl_device = sy::device(sy::host_selector());
+            const sycl::device &sycl_device =
+                    sycl::device(sycl::cpu_selector_v);
             const Device open3d_device = Device("SYCL:0");
             utility::LogWarning(
                     "SYCL GPU device is not available, falling back to SYCL "
                     "host device.");
             devices_.push_back(open3d_device);
             device_to_sycl_device_[open3d_device] = sycl_device;
-            device_to_default_queue_[open3d_device] = sy::queue(sycl_device);
-        } catch (const sy::exception &e) {
+            device_to_default_queue_[open3d_device] = sycl::queue(sycl_device);
+        } catch (const sycl::exception &e) {
         }
     }
 
@@ -76,6 +77,6 @@ SYCLContext::SYCLContext() {
     }
 }
 
-}  // namespace sycl
+}  // namespace sy
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h
index e23b9d8770c..b44e2e3fb96 100644
--- a/cpp/open3d/core/SYCLContext.h
+++ b/cpp/open3d/core/SYCLContext.h
@@ -14,16 +14,14 @@
 
 #pragma once
 
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 #include <unordered_map>
 
 #include "open3d/core/Device.h"
 
 namespace open3d {
 namespace core {
-namespace sycl {
-
-namespace sy = cl::sycl;
+namespace sy {
 
 /// Singleton SYCL context manager. It maintains:
 /// - A default queue for each SYCL device
@@ -45,7 +43,7 @@ class SYCLContext {
     std::vector<Device> GetAvailableSYCLDevices();
 
     /// Get the default SYCL queue given an Open3D device.
-    sy::queue GetDefaultQueue(const Device& device);
+    sycl::queue GetDefaultQueue(const Device& device);
 
 private:
     SYCLContext();
@@ -54,12 +52,12 @@ class SYCLContext {
     std::vector<Device> devices_;
 
     /// Maps core::Device to the corresponding default SYCL queue.
-    std::unordered_map<Device, sy::queue> device_to_default_queue_;
+    std::unordered_map<Device, sycl::queue> device_to_default_queue_;
 
-    /// Maps core::Device to sy::device. Internal use only for now.
-    std::unordered_map<Device, sy::device> device_to_sycl_device_;
+    /// Maps core::Device to sycl::device. Internal use only for now.
+    std::unordered_map<Device, sycl::device> device_to_sycl_device_;
 };
 
-}  // namespace sycl
+}  // namespace sy
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/SYCLUtils.cpp b/cpp/open3d/core/SYCLUtils.cpp
index d9e42fdcde0..5860021996d 100644
--- a/cpp/open3d/core/SYCLUtils.cpp
+++ b/cpp/open3d/core/SYCLUtils.cpp
@@ -22,45 +22,42 @@
 #include "open3d/utility/Logging.h"
 
 #ifdef BUILD_SYCL_MODULE
-#include <CL/sycl.hpp>
+#include <sycl/sycl.hpp>
 
 #include "open3d/core/SYCLContext.h"
 #endif
 
 namespace open3d {
 namespace core {
-namespace sycl {
-
-#ifdef BUILD_SYCL_MODULE
-namespace sy = cl::sycl;
-#endif
+namespace sy {
 
 int SYCLDemo() {
 #ifdef BUILD_SYCL_MODULE
     // Ref: https://intel.github.io/llvm-docs/GetStartedGuide.html
     // Creating buffer of 4 ints to be used inside the kernel code.
-    sy::buffer<sy::cl_int, 1> buffer(4);
+    sycl::buffer<int, 1> buffer(4);
 
     // Creating SYCL queue.
-    sy::queue q;
+    sycl::queue q;
 
     // Size of index space for kernel.
-    sy::range<1> num_workloads{buffer.size()};
+    sycl::range<1> num_workloads{buffer.size()};
 
     // Submitting command group(work) to q.
-    q.submit([&](sy::handler &cgh) {
+    q.submit([&](sycl::handler &cgh) {
         // Getting write only access to the buffer on a device.
-        auto accessor = buffer.get_access<sy::access::mode::write>(cgh);
+        auto accessor = buffer.get_access<sycl::access::mode::write>(cgh);
         // Execute kernel.
-        cgh.parallel_for<class FillBuffer>(num_workloads, [=](sy::id<1> WIid) {
-            // Fill buffer with indexes.
-            accessor[WIid] = (sy::cl_int)WIid.get(0);
-        });
+        cgh.parallel_for<class FillBuffer>(
+                num_workloads, [=](sycl::id<1> WIid) {
+                    // Fill buffer with indexes.
+                    accessor[WIid] = (int)WIid.get(0);
+                });
     });
 
-    // Getting read only access to the buffer on the host.
+    // Getting access to the buffer on the host.
     // Implicit barrier waiting for q to complete the work.
-    const auto host_accessor = buffer.get_access<sy::access::mode::read>();
+    const auto host_accessor = buffer.get_host_access();
 
     // Check the results.
     bool mismatch_found = false;
@@ -87,34 +84,34 @@ int SYCLDemo() {
 
 #ifdef BUILD_SYCL_MODULE
 
-static std::string GetDeviceTypeName(const sy::device &device) {
-    auto device_type = device.get_info<sy::info::device::device_type>();
+static std::string GetDeviceTypeName(const sycl::device &device) {
+    auto device_type = device.get_info<sycl::info::device::device_type>();
     switch (device_type) {
-        case sy::info::device_type::cpu:
+        case sycl::info::device_type::cpu:
             return "cpu";
-        case sy::info::device_type::gpu:
+        case sycl::info::device_type::gpu:
             return "gpu";
-        case sy::info::device_type::host:
+        case sycl::info::device_type::host:
             return "host";
-        case sy::info::device_type::accelerator:
+        case sycl::info::device_type::accelerator:
             return "acc";
         default:
             return "unknown";
     }
 }
 
-static std::string GetBackendName(const sy::device &device) {
-    sy::platform platform = device.get_info<sy::info::device::platform>();
-    sy::backend backend = platform.get_backend();
+static std::string GetBackendName(const sycl::device &device) {
+    sycl::platform platform = device.get_info<sycl::info::device::platform>();
+    sycl::backend backend = platform.get_backend();
     std::ostringstream os;
     os << backend;
     return os.str();
 }
 
-static std::string SYCLDeviceToString(const sy::device &device) {
+static std::string SYCLDeviceToString(const sycl::device &device) {
     std::ostringstream os;
     os << "[" << GetBackendName(device) << ":" << GetDeviceTypeName(device)
-       << "] " << device.get_info<sy::info::device::name>();
+       << "] " << device.get_info<sycl::info::device::name>();
     return os.str();
 }
 #endif
@@ -131,68 +128,62 @@ void PrintSYCLDevices(bool print_all) {
 
     if (print_all) {
         utility::LogInfo("# All SYCL devices");
-        const std::vector<sy::platform> &platforms =
-                sy::platform::get_platforms();
-        for (const sy::platform &platform : platforms) {
-            sy::backend backend = platform.get_backend();
-            const std::vector<sy::device> &devices = platform.get_devices();
-            for (const sy::device &device : devices) {
+        const std::vector<sycl::platform> &platforms =
+                sycl::platform::get_platforms();
+        for (const sycl::platform &platform : platforms) {
+            sycl::backend backend = platform.get_backend();
+            const std::vector<sycl::device> &devices = platform.get_devices();
+            for (const sycl::device &device : devices) {
                 utility::LogInfo("- {}", SYCLDeviceToString(device));
             }
         }
 
         utility::LogInfo("# Default SYCL selectors");
         try {
-            const sy::device &device = sy::device(sy::default_selector());
-            utility::LogInfo("- sycl::default_selector()    : {}",
+            const sycl::device &device = sycl::device(sycl::default_selector_v);
+            utility::LogInfo("- sycl::default_selector_v    : {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
-            utility::LogInfo("- sycl::default_selector()    : N/A");
+        } catch (const sycl::exception &e) {
+            utility::LogInfo("- sycl::default_selector_v    : N/A");
         }
         try {
-            const sy::device &device = sy::device(sy::host_selector());
-            utility::LogInfo("- sycl::host_selector()       : {}",
+            const sycl::device &device = sycl::device(sycl::cpu_selector_v);
+            utility::LogInfo("- sycl::cpu_selector_v        : {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
-            utility::LogInfo("- sycl::host_selector()       : N/A");
+        } catch (const sycl::exception &e) {
+            utility::LogInfo("- sycl::cpu_selector_v        : N/A");
         }
         try {
-            const sy::device &device = sy::device(sy::cpu_selector());
-            utility::LogInfo("- sycl::cpu_selector()        : {}",
+            const sycl::device &device = sycl::device(sycl::gpu_selector_v);
+            utility::LogInfo("- sycl::gpu_selector_v        : {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
-            utility::LogInfo("- sycl::cpu_selector()        : N/A");
+        } catch (const sycl::exception &e) {
+            utility::LogInfo("- sycl::gpu_selector_v        : N/A");
         }
         try {
-            const sy::device &device = sy::device(sy::gpu_selector());
-            utility::LogInfo("- sycl::gpu_selector()        : {}",
+            const sycl::device &device =
+                    sycl::device(sycl::accelerator_selector_v);
+            utility::LogInfo("- sycl::accelerator_selector_v: {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
-            utility::LogInfo("- sycl::gpu_selector()        : N/A");
-        }
-        try {
-            const sy::device &device = sy::device(sy::accelerator_selector());
-            utility::LogInfo("- sycl::accelerator_selector(): {}",
-                             SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
-            utility::LogInfo("- sycl::accelerator_selector(): N/A");
+        } catch (const sycl::exception &e) {
+            utility::LogInfo("- sycl::accelerator_selector_v: N/A");
         }
 
         utility::LogInfo("# Open3D SYCL device");
         try {
-            const sy::device &device = sy::device(sy::gpu_selector());
+            const sycl::device &device = sycl::device(sycl::gpu_selector_v);
             utility::LogInfo("- Device(\"SYCL:0\"): {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
+        } catch (const sycl::exception &e) {
             utility::LogInfo("- Device(\"SYCL:0\"): N/A");
         }
     } else {
         utility::LogInfo("# Open3D SYCL device");
         try {
-            const sy::device &device = sy::device(sy::gpu_selector());
+            const sycl::device &device = sycl::device(sycl::gpu_selector_v);
             utility::LogInfo("- Device(\"SYCL:0\"): {}",
                              SYCLDeviceToString(device));
-        } catch (const sy::exception &e) {
+        } catch (const sycl::exception &e) {
             utility::LogInfo("- Device(\"SYCL:0\"): N/A");
         }
     }
@@ -227,6 +218,19 @@ std::vector<Device> GetAvailableSYCLDevices() {
 #endif
 }
 
-}  // namespace sycl
+void enablePersistentJITCache() {
+#ifdef BUILD_SYCL_MODULE
+#if defined(_WIN32)
+    _putenv_s("SYCL_CACHE_PERSISTENT", "1");
+#else
+    setenv("SYCL_CACHE_PERSISTENT", "1", 1);
+#endif
+#endif
+    utility::LogInfo(
+            "enablePersistentJITCache is not compiled with "
+            "BUILD_SYCL_MODULE=ON.");
+}
+
+}  // namespace sy
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/SYCLUtils.h b/cpp/open3d/core/SYCLUtils.h
index 385523f3800..8a38ad662d0 100644
--- a/cpp/open3d/core/SYCLUtils.h
+++ b/cpp/open3d/core/SYCLUtils.h
@@ -19,7 +19,7 @@
 
 namespace open3d {
 namespace core {
-namespace sycl {
+namespace sy {
 
 /// Runs simple SYCL test program for sanity checks.
 /// \return Retuns 0 if successful.
@@ -40,6 +40,10 @@ bool IsDeviceAvailable(const Device& device);
 /// Return a list of available SYCL devices.
 std::vector<Device> GetAvailableSYCLDevices();
 
-}  // namespace sycl
+/// Enables the JIT cache for SYCL. This sets an environment variable and will
+/// affect the entire process and any child processes.
+void enablePersistentJITCache();
+
+}  // namespace sy
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/t/geometry/CMakeLists.txt b/cpp/open3d/t/geometry/CMakeLists.txt
index 1b1ba5760cf..e345c819b6f 100644
--- a/cpp/open3d/t/geometry/CMakeLists.txt
+++ b/cpp/open3d/t/geometry/CMakeLists.txt
@@ -7,7 +7,6 @@ target_sources(tgeometry PRIVATE
     LineSet.cpp
     BoundingVolume.cpp
     PointCloud.cpp
-    RaycastingScene.cpp
     RGBDImage.cpp
     TensorMap.cpp
     TriangleMesh.cpp
@@ -16,6 +15,16 @@ target_sources(tgeometry PRIVATE
     VtkUtils.cpp
 )
 
+if (BUILD_SYCL_MODULE)
+    open3d_sycl_target_sources(tgeometry PRIVATE
+        RaycastingScene.cpp
+    )
+else()
+    target_sources(tgeometry PRIVATE
+        RaycastingScene.cpp
+    )
+endif()
+
 open3d_show_and_abort_on_warning(tgeometry)
 open3d_set_global_properties(tgeometry)
 open3d_set_open3d_lib_properties(tgeometry)
diff --git a/cpp/open3d/t/geometry/RaycastingScene.cpp b/cpp/open3d/t/geometry/RaycastingScene.cpp
index 12f082d0ee1..444f4f7a5ff 100644
--- a/cpp/open3d/t/geometry/RaycastingScene.cpp
+++ b/cpp/open3d/t/geometry/RaycastingScene.cpp
@@ -11,11 +11,15 @@
 #endif
 #include "open3d/t/geometry/RaycastingScene.h"
 
+#ifdef BUILD_SYCL_MODULE
+#include <sycl/sycl.hpp>
+#endif
 // This header is in the embree src dir (embree/src/ext_embree/..).
 #include <embree4/rtcore.h>
 #include <tbb/parallel_for.h>
 
 #include <Eigen/Core>
+#include <cstring>
 #include <tuple>
 #include <unsupported/Eigen/AlignedVector3>
 #include <vector>
@@ -24,51 +28,26 @@
 #include "open3d/utility/Helper.h"
 #include "open3d/utility/Logging.h"
 
-namespace {
-
-typedef Eigen::AlignedVector3<float> Vec3fa;
-// Dont force alignment for Vec2f because we use it just for storing
-typedef Eigen::Matrix<float, 2, 1, Eigen::DontAlign> Vec2f;
-typedef Eigen::Vector3f Vec3f;
-
-// Error function called by embree.
-void ErrorFunction(void* userPtr, enum RTCError error, const char* str) {
-    open3d::utility::LogError("Embree error: {} {}", rtcGetErrorString(error),
-                              str);
-}
+namespace callbacks {
 
-// Checks the last dim, ensures that the number of dims is >= min_ndim, checks
-// the device, and dtype.
-template <class DTYPE>
-void AssertTensorDtypeLastDimDeviceMinNDim(const open3d::core::Tensor& tensor,
-                                           const std::string& tensor_name,
-                                           int64_t last_dim,
-                                           const open3d::core::Device& device,
-                                           int64_t min_ndim = 2) {
-    open3d::core::AssertTensorDevice(tensor, device);
-    if (tensor.NumDims() < min_ndim) {
-        open3d::utility::LogError(
-                "{} Tensor ndim is {} but expected ndim >= {}", tensor_name,
-                tensor.NumDims(), min_ndim);
-    }
-    if (tensor.GetShape().back() != last_dim) {
-        open3d::utility::LogError(
-                "The last dimension of the {} Tensor must be {} but got "
-                "Tensor with shape {}",
-                tensor_name, last_dim, tensor.GetShape().ToString());
-    }
-    open3d::core::AssertTensorDtype(tensor,
-                                    open3d::core::Dtype::FromType<DTYPE>());
-}
+struct GeomPrimID {
+    uint32_t geomID;
+    uint32_t primID;
+    float ray_tfar;
+};
 
 struct CountIntersectionsContext {
     RTCRayQueryContext context;
-    std::vector<std::tuple<uint32_t, uint32_t, float>>*
-            previous_geom_prim_ID_tfar;
+    GeomPrimID* previous_geom_prim_ID_tfar;
     int* intersections;
 };
 
+#ifdef BUILD_SYCL_MODULE
+RTC_SYCL_INDIRECTLY_CALLABLE void CountIntersectionsFunc(
+        const RTCFilterFunctionNArguments* args) {
+#else
 void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
+#endif
     int* valid = args->valid;
     const CountIntersectionsContext* context =
             reinterpret_cast<const CountIntersectionsContext*>(args->context);
@@ -79,8 +58,8 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
     // Avoid crashing when debug visualizations are used.
     if (context == nullptr) return;
 
-    std::vector<std::tuple<uint32_t, uint32_t, float>>*
-            previous_geom_prim_ID_tfar = context->previous_geom_prim_ID_tfar;
+    GeomPrimID* previous_geom_prim_ID_tfar =
+            context->previous_geom_prim_ID_tfar;
     int* intersections = context->intersections;
 
     // Iterate over all rays in ray packet.
@@ -97,14 +76,13 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
         RTCHit hit = rtcGetHitFromHitN(hitN, N, ui);
 
         unsigned int ray_id = ray.id;
-        std::tuple<uint32_t, uint32_t, float> gpID(hit.geomID, hit.primID,
-                                                   ray.tfar);
-        auto& prev_gpIDtfar = previous_geom_prim_ID_tfar->operator[](ray_id);
-        if (std::get<0>(prev_gpIDtfar) != hit.geomID ||
-            (std::get<1>(prev_gpIDtfar) != hit.primID &&
-             std::get<2>(prev_gpIDtfar) != ray.tfar)) {
+        GeomPrimID gpID = {hit.geomID, hit.primID, ray.tfar};
+        auto& prev_gpIDtfar = previous_geom_prim_ID_tfar[ray_id];
+        if (prev_gpIDtfar.geomID != hit.geomID ||
+            (prev_gpIDtfar.primID != hit.primID &&
+             prev_gpIDtfar.ray_tfar != ray.tfar)) {
             ++(intersections[ray_id]);
-            previous_geom_prim_ID_tfar->operator[](ray_id) = gpID;
+            previous_geom_prim_ID_tfar[ray_id] = gpID;
         }
         // Always ignore hit
         valid[ui] = 0;
@@ -113,18 +91,22 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
 
 struct ListIntersectionsContext {
     RTCRayQueryContext context;
-    std::vector<std::tuple<uint32_t, uint32_t, float>>*
-            previous_geom_prim_ID_tfar;
+    GeomPrimID* previous_geom_prim_ID_tfar;
     unsigned int* ray_ids;
     unsigned int* geometry_ids;
     unsigned int* primitive_ids;
     float* primitive_uvs;
     float* t_hit;
-    Eigen::VectorXi cumsum;
+    int* cumsum;
     unsigned int* track_intersections;
 };
 
+#ifdef BUILD_SYCL_MODULE
+RTC_SYCL_INDIRECTLY_CALLABLE void ListIntersectionsFunc(
+        const RTCFilterFunctionNArguments* args) {
+#else
 void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
+#endif
     int* valid = args->valid;
     const ListIntersectionsContext* context =
             reinterpret_cast<const ListIntersectionsContext*>(args->context);
@@ -135,14 +117,14 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
     // Avoid crashing when debug visualizations are used.
     if (context == nullptr) return;
 
-    std::vector<std::tuple<uint32_t, uint32_t, float>>*
-            previous_geom_prim_ID_tfar = context->previous_geom_prim_ID_tfar;
+    GeomPrimID* previous_geom_prim_ID_tfar =
+            context->previous_geom_prim_ID_tfar;
     unsigned int* ray_ids = context->ray_ids;
     unsigned int* geometry_ids = context->geometry_ids;
     unsigned int* primitive_ids = context->primitive_ids;
     float* primitive_uvs = context->primitive_uvs;
     float* t_hit = context->t_hit;
-    Eigen::VectorXi cumsum = context->cumsum;
+    int* cumsum = context->cumsum;
     unsigned int* track_intersections = context->track_intersections;
 
     // Iterate over all rays in ray packet.
@@ -159,12 +141,11 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
         RTCHit hit = rtcGetHitFromHitN(hitN, N, ui);
 
         unsigned int ray_id = ray.id;
-        std::tuple<uint32_t, uint32_t, float> gpID(hit.geomID, hit.primID,
-                                                   ray.tfar);
-        auto& prev_gpIDtfar = previous_geom_prim_ID_tfar->operator[](ray_id);
-        if (std::get<0>(prev_gpIDtfar) != hit.geomID ||
-            (std::get<1>(prev_gpIDtfar) != hit.primID &&
-             std::get<2>(prev_gpIDtfar) != ray.tfar)) {
+        GeomPrimID gpID = {hit.geomID, hit.primID, ray.tfar};
+        auto& prev_gpIDtfar = previous_geom_prim_ID_tfar[ray_id];
+        if (prev_gpIDtfar.geomID != hit.geomID ||
+            (prev_gpIDtfar.primID != hit.primID &&
+             prev_gpIDtfar.ray_tfar != ray.tfar)) {
             size_t idx = cumsum[ray_id] + track_intersections[ray_id];
             ray_ids[idx] = ray_id;
             geometry_ids[idx] = hit.geomID;
@@ -172,7 +153,7 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
             primitive_uvs[idx * 2 + 0] = hit.u;
             primitive_uvs[idx * 2 + 1] = hit.v;
             t_hit[idx] = ray.tfar;
-            previous_geom_prim_ID_tfar->operator[](ray_id) = gpID;
+            previous_geom_prim_ID_tfar[ray_id] = gpID;
             ++(track_intersections[ray_id]);
         }
         // Always ignore hit
@@ -180,16 +161,56 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) {
     }
 }
 
+}  // namespace callbacks
+
+namespace {
+
+typedef Eigen::AlignedVector3<float> Vec3fa;
+// Dont force alignment for Vec2f because we use it just for storing
+typedef Eigen::Matrix<float, 2, 1, Eigen::DontAlign> Vec2f;
+typedef Eigen::Vector3f Vec3f;
+
+// Error function called by embree.
+void ErrorFunction(void* userPtr, enum RTCError error, const char* str) {
+    open3d::utility::LogError("Embree error: {} {}", rtcGetErrorString(error),
+                              str);
+}
+
+// Checks the last dim, ensures that the number of dims is >= min_ndim, checks
+// the device, and dtype.
+template <class DTYPE>
+void AssertTensorDtypeLastDimDeviceMinNDim(const open3d::core::Tensor& tensor,
+                                           const std::string& tensor_name,
+                                           int64_t last_dim,
+                                           const open3d::core::Device& device,
+                                           int64_t min_ndim = 2) {
+    open3d::core::AssertTensorDevice(tensor, device);
+    if (tensor.NumDims() < min_ndim) {
+        open3d::utility::LogError(
+                "{} Tensor ndim is {} but expected ndim >= {}", tensor_name,
+                tensor.NumDims(), min_ndim);
+    }
+    if (tensor.GetShape().back() != last_dim) {
+        open3d::utility::LogError(
+                "The last dimension of the {} Tensor must be {} but got "
+                "Tensor with shape {}",
+                tensor_name, last_dim, tensor.GetShape().ToString());
+    }
+    open3d::core::AssertTensorDtype(tensor,
+                                    open3d::core::Dtype::FromType<DTYPE>());
+}
+
 // Adapted from common/math/closest_point.h
-inline Vec3fa closestPointTriangle(Vec3fa const& p,
-                                   Vec3fa const& a,
-                                   Vec3fa const& b,
-                                   Vec3fa const& c,
-                                   float& tex_u,
-                                   float& tex_v) {
-    const Vec3fa ab = b - a;
-    const Vec3fa ac = c - a;
-    const Vec3fa ap = p - a;
+template <typename Vec3faType>
+inline Vec3faType closestPointTriangle(Vec3faType const& p,
+                                       Vec3faType const& a,
+                                       Vec3faType const& b,
+                                       Vec3faType const& c,
+                                       float& tex_u,
+                                       float& tex_v) {
+    const Vec3faType ab = b - a;
+    const Vec3faType ac = c - a;
+    const Vec3faType ap = p - a;
 
     const float d1 = ab.dot(ap);
     const float d2 = ac.dot(ap);
@@ -199,7 +220,7 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p,
         return a;
     }
 
-    const Vec3fa bp = p - b;
+    const Vec3faType bp = p - b;
     const float d3 = ab.dot(bp);
     const float d4 = ac.dot(bp);
     if (d3 >= 0.f && d4 <= d3) {
@@ -208,7 +229,7 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p,
         return b;
     }
 
-    const Vec3fa cp = p - c;
+    const Vec3faType cp = p - c;
     const float d5 = ab.dot(cp);
     const float d6 = ac.dot(cp);
     if (d6 >= 0.f && d5 <= d6) {
@@ -249,56 +270,64 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p,
     return a + v * ab + w * ac;
 }
 
+struct GeometryPtr {
+    RTCGeometryType geom_type;
+    const void* ptr1;
+    const void* ptr2;
+};
+
+template <typename Vec3fType, typename Vec2fType>
 struct ClosestPointResult {
     ClosestPointResult()
-        : primID(RTC_INVALID_GEOMETRY_ID),
-          geomID(RTC_INVALID_GEOMETRY_ID),
-          geometry_ptrs_ptr() {}
+        : primID(RTC_INVALID_GEOMETRY_ID), geomID(RTC_INVALID_GEOMETRY_ID) {}
 
-    Vec3f p;
+    Vec3fType p;
     unsigned int primID;
     unsigned int geomID;
-    Vec2f uv;
-    Vec3f n;
-    std::vector<std::tuple<RTCGeometryType, const void*, const void*>>*
-            geometry_ptrs_ptr;
+    Vec2fType uv;
+    Vec3fType n;
+    GeometryPtr* geometry_ptrs_ptr;
 };
 
 // Code adapted from the embree closest_point tutorial.
+template <typename Vec3fType, typename Vec3faType, typename Vec2fType>
 bool ClosestPointFunc(RTCPointQueryFunctionArguments* args) {
     assert(args->userPtr);
     const unsigned int geomID = args->geomID;
     const unsigned int primID = args->primID;
 
     // query position in world space
-    Vec3fa q(args->query->x, args->query->y, args->query->z);
+    Vec3faType q(args->query->x, args->query->y, args->query->z);
 
-    ClosestPointResult* result =
-            static_cast<ClosestPointResult*>(args->userPtr);
+    ClosestPointResult<Vec3fType, Vec2fType>* result =
+            static_cast<ClosestPointResult<Vec3fType, Vec2fType>*>(
+                    args->userPtr);
     const RTCGeometryType geom_type =
-            std::get<0>(result->geometry_ptrs_ptr->operator[](geomID));
-    const void* ptr1 =
-            std::get<1>(result->geometry_ptrs_ptr->operator[](geomID));
-    const void* ptr2 =
-            std::get<2>(result->geometry_ptrs_ptr->operator[](geomID));
+            result->geometry_ptrs_ptr[geomID].geom_type;
+    const void* ptr1 = result->geometry_ptrs_ptr[geomID].ptr1;
+    const void* ptr2 = result->geometry_ptrs_ptr[geomID].ptr2;
 
     if (RTC_GEOMETRY_TYPE_TRIANGLE == geom_type) {
         const float* vertex_positions = (const float*)ptr1;
         const uint32_t* triangle_indices = (const uint32_t*)ptr2;
 
-        Vec3fa v0(vertex_positions[3 * triangle_indices[3 * primID + 0] + 0],
-                  vertex_positions[3 * triangle_indices[3 * primID + 0] + 1],
-                  vertex_positions[3 * triangle_indices[3 * primID + 0] + 2]);
-        Vec3fa v1(vertex_positions[3 * triangle_indices[3 * primID + 1] + 0],
-                  vertex_positions[3 * triangle_indices[3 * primID + 1] + 1],
-                  vertex_positions[3 * triangle_indices[3 * primID + 1] + 2]);
-        Vec3fa v2(vertex_positions[3 * triangle_indices[3 * primID + 2] + 0],
-                  vertex_positions[3 * triangle_indices[3 * primID + 2] + 1],
-                  vertex_positions[3 * triangle_indices[3 * primID + 2] + 2]);
+        Vec3faType v0(
+                vertex_positions[3 * triangle_indices[3 * primID + 0] + 0],
+                vertex_positions[3 * triangle_indices[3 * primID + 0] + 1],
+                vertex_positions[3 * triangle_indices[3 * primID + 0] + 2]);
+        Vec3faType v1(
+                vertex_positions[3 * triangle_indices[3 * primID + 1] + 0],
+                vertex_positions[3 * triangle_indices[3 * primID + 1] + 1],
+                vertex_positions[3 * triangle_indices[3 * primID + 1] + 2]);
+        Vec3faType v2(
+                vertex_positions[3 * triangle_indices[3 * primID + 2] + 0],
+                vertex_positions[3 * triangle_indices[3 * primID + 2] + 1],
+                vertex_positions[3 * triangle_indices[3 * primID + 2] + 2]);
 
         // Determine distance to closest point on triangle
         float u, v;
-        const Vec3fa p = closestPointTriangle(q, v0, v1, v2, u, v);
+        const Vec3faType p =
+                closestPointTriangle<Vec3faType>(q, v0, v1, v2, u, v);
         float d = (q - p).norm();
 
         // Store result in userPtr and update the query radius if we found a
@@ -309,9 +338,9 @@ bool ClosestPointFunc(RTCPointQueryFunctionArguments* args) {
             result->p = p;
             result->primID = primID;
             result->geomID = geomID;
-            Vec3fa e1 = v1 - v0;
-            Vec3fa e2 = v2 - v0;
-            result->uv = Vec2f(u, v);
+            Vec3faType e1 = v1 - v0;
+            Vec3faType e2 = v2 - v0;
+            result->uv = Vec2fType(u, v);
             result->n = (e1.cross(e2)).normalized();
             return true;  // Return true to indicate that the query radius
                           // changed.
@@ -327,18 +356,17 @@ namespace t {
 namespace geometry {
 
 struct RaycastingScene::Impl {
-    // The maximum number of rays used in calls to embree.
-    const size_t BATCH_SIZE = 1024;
-    RTCDevice device_;
     RTCScene scene_;
     bool scene_committed_;  // true if the scene has been committed.
+    RTCDevice device_;
     // Vector for storing some information about the added geometry.
-    std::vector<std::tuple<RTCGeometryType, const void*, const void*>>
-            geometry_ptrs_;
-    core::Device tensor_device_;  // cpu
+    std::vector<GeometryPtr> geometry_ptrs_;
+    core::Device tensor_device_;  // cpu or sycl
 
     bool devprop_join_commit;
 
+    virtual ~Impl() = default;
+
     void CommitScene() {
         if (!scene_committed_) {
             if (devprop_join_commit) {
@@ -350,7 +378,100 @@ struct RaycastingScene::Impl {
         }
     }
 
-    template <bool LINE_INTERSECTION>
+    virtual void CastRays(const float* const rays,
+                          const size_t num_rays,
+                          float* t_hit,
+                          unsigned int* geometry_ids,
+                          unsigned int* primitive_ids,
+                          float* primitive_uvs,
+                          float* primitive_normals,
+                          const int nthreads,
+                          const bool line_intersection) = 0;
+
+    virtual void TestOcclusions(const float* const rays,
+                                const size_t num_rays,
+                                const float tnear,
+                                const float tfar,
+                                int8_t* occluded,
+                                const int nthreads) = 0;
+
+    virtual void CountIntersections(const float* const rays,
+                                    const size_t num_rays,
+                                    int* intersections,
+                                    const int nthreads) = 0;
+
+    virtual void ListIntersections(const float* const rays,
+                                   const size_t num_rays,
+                                   const size_t num_intersections,
+                                   int* cumsum,
+                                   unsigned int* track_intersections,
+                                   unsigned int* ray_ids,
+                                   unsigned int* geometry_ids,
+                                   unsigned int* primitive_ids,
+                                   float* primitive_uvs,
+                                   float* t_hit,
+                                   const int nthreads) = 0;
+
+    virtual void ComputeClosestPoints(const float* const query_points,
+                                      const size_t num_query_points,
+                                      float* closest_points,
+                                      unsigned int* geometry_ids,
+                                      unsigned int* primitive_ids,
+                                      float* primitive_uvs,
+                                      float* primitive_normals,
+                                      const int nthreads) = 0;
+
+    virtual void ArraySum(int* data_ptr,
+                          size_t num_elements,
+                          size_t& result) = 0;
+
+    virtual void ArrayPartialSum(int* input,
+                                 int* output,
+                                 size_t num_elements) = 0;
+
+    virtual void CopyArray(int* src, uint32_t* dst, size_t num_elements) = 0;
+};
+
+#ifdef BUILD_SYCL_MODULE
+struct RaycastingScene::SYCLImpl : public RaycastingScene::Impl {
+    // SYCL variables
+    sycl::queue queue_;
+    sycl::context context_;
+    sycl::device sycl_device_;
+
+    callbacks::GeomPrimID* li_previous_geom_prim_ID_tfar = nullptr;
+    callbacks::GeomPrimID* ci_previous_geom_prim_ID_tfar = nullptr;
+
+    ~SYCLImpl() {
+        if (li_previous_geom_prim_ID_tfar) {
+            sycl::free(li_previous_geom_prim_ID_tfar, queue_);
+        }
+        if (ci_previous_geom_prim_ID_tfar) {
+            sycl::free(ci_previous_geom_prim_ID_tfar, queue_);
+        }
+    }
+
+    void InitializeDevice() {
+        try {
+            sycl_device_ = sycl::device(rtcSYCLDeviceSelector);
+        } catch (std::exception& e) {
+            utility::LogError("Caught exception creating sycl::device: {}",
+                              e.what());
+            return;
+        }
+
+        queue_ = sycl::queue(sycl_device_);
+        context_ = sycl::context(sycl_device_);
+
+        device_ = rtcNewSYCLDevice(context_, "");
+        rtcSetDeviceSYCLDevice(device_, sycl_device_);
+
+        if (!device_) {
+            utility::LogError("Error %d: cannot create device\n",
+                              rtcGetDeviceError(NULL));
+        }
+    }
+
     void CastRays(const float* const rays,
                   const size_t num_rays,
                   float* t_hit,
@@ -358,7 +479,361 @@ struct RaycastingScene::Impl {
                   unsigned int* primitive_ids,
                   float* primitive_uvs,
                   float* primitive_normals,
-                  const int nthreads) {
+                  const int nthreads,
+                  const bool line_intersection) override {
+        CommitScene();
+
+        auto scene = this->scene_;
+        queue_.submit([=](sycl::handler& cgh) {
+            cgh.parallel_for(
+                    sycl::range<1>(num_rays),
+                    [=](sycl::item<1> item, sycl::kernel_handler kh) {
+                        const size_t i = item.get_id(0);
+
+                        struct RTCRayHit rh;
+                        const float* r = &rays[i * 6];
+                        rh.ray.org_x = r[0];
+                        rh.ray.org_y = r[1];
+                        rh.ray.org_z = r[2];
+                        if (line_intersection) {
+                            rh.ray.dir_x = r[3] - r[0];
+                            rh.ray.dir_y = r[4] - r[1];
+                            rh.ray.dir_z = r[5] - r[2];
+                        } else {
+                            rh.ray.dir_x = r[3];
+                            rh.ray.dir_y = r[4];
+                            rh.ray.dir_z = r[5];
+                        }
+                        rh.ray.tnear = 0;
+                        if (line_intersection) {
+                            rh.ray.tfar = 1.f;
+                        } else {
+                            rh.ray.tfar =
+                                    std::numeric_limits<float>::infinity();
+                        }
+                        rh.ray.mask = -1;
+                        rh.ray.id = i;
+                        rh.ray.flags = 0;
+                        rh.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+                        rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+
+                        rtcIntersect1(scene, &rh);
+
+                        t_hit[i] = rh.ray.tfar;
+                        if (rh.hit.geomID != RTC_INVALID_GEOMETRY_ID) {
+                            geometry_ids[i] = rh.hit.geomID;
+                            primitive_ids[i] = rh.hit.primID;
+                            primitive_uvs[i * 2 + 0] = rh.hit.u;
+                            primitive_uvs[i * 2 + 1] = rh.hit.v;
+                            float inv_norm =
+                                    1.f / std::sqrt(rh.hit.Ng_x * rh.hit.Ng_x +
+                                                    rh.hit.Ng_y * rh.hit.Ng_y +
+                                                    rh.hit.Ng_z * rh.hit.Ng_z);
+                            primitive_normals[i * 3 + 0] =
+                                    rh.hit.Ng_x * inv_norm;
+                            primitive_normals[i * 3 + 1] =
+                                    rh.hit.Ng_y * inv_norm;
+                            primitive_normals[i * 3 + 2] =
+                                    rh.hit.Ng_z * inv_norm;
+                        } else {
+                            geometry_ids[i] = RTC_INVALID_GEOMETRY_ID;
+                            primitive_ids[i] = RTC_INVALID_GEOMETRY_ID;
+                            primitive_uvs[i * 2 + 0] = 0;
+                            primitive_uvs[i * 2 + 1] = 0;
+                            primitive_normals[i * 3 + 0] = 0;
+                            primitive_normals[i * 3 + 1] = 0;
+                            primitive_normals[i * 3 + 2] = 0;
+                        }
+                    });
+        });
+        queue_.wait_and_throw();
+    }
+
+    void TestOcclusions(const float* const rays,
+                        const size_t num_rays,
+                        const float tnear,
+                        const float tfar,
+                        int8_t* occluded,
+                        const int nthreads) override {
+        CommitScene();
+
+        auto scene = this->scene_;
+        queue_.submit([=](sycl::handler& cgh) {
+            cgh.parallel_for(
+                    sycl::range<1>(num_rays),
+                    [=](sycl::item<1> item, sycl::kernel_handler kh) {
+                        struct RTCRayQueryContext context;
+                        rtcInitRayQueryContext(&context);
+
+                        RTCOccludedArguments args;
+                        rtcInitOccludedArguments(&args);
+                        args.context = &context;
+
+                        const size_t i = item.get_id(0);
+
+                        struct RTCRay ray;
+                        const float* r = &rays[i * 6];
+                        ray.org_x = r[0];
+                        ray.org_y = r[1];
+                        ray.org_z = r[2];
+                        ray.dir_x = r[3];
+                        ray.dir_y = r[4];
+                        ray.dir_z = r[5];
+                        ray.tnear = tnear;
+                        ray.tfar = tfar;
+                        ray.mask = -1;
+                        ray.id = i;
+                        ray.flags = 0;
+
+                        rtcOccluded1(scene, &ray, &args);
+
+                        occluded[i] = int8_t(
+                                -std::numeric_limits<float>::infinity() ==
+                                ray.tfar);
+                    });
+        });
+        queue_.wait_and_throw();
+    }
+
+    void CountIntersections(const float* const rays,
+                            const size_t num_rays,
+                            int* intersections,
+                            const int nthreads) override {
+        CommitScene();
+
+        queue_.memset(intersections, 0, sizeof(int) * num_rays).wait();
+
+        ci_previous_geom_prim_ID_tfar =
+                sycl::malloc_device<callbacks::GeomPrimID>(num_rays, queue_);
+
+        // Check if allocation was successful
+        if (!ci_previous_geom_prim_ID_tfar) {
+            throw std::runtime_error("Failed to allocate device memory");
+        }
+
+        auto host_previous_geom_prim_ID_tfar =
+                std::unique_ptr<callbacks::GeomPrimID[],
+                                std::default_delete<callbacks::GeomPrimID[]>>(
+                        new callbacks::GeomPrimID[num_rays]);
+        for (size_t i = 0; i < num_rays; ++i) {
+            host_previous_geom_prim_ID_tfar[i] = {
+                    uint32_t(RTC_INVALID_GEOMETRY_ID),
+                    uint32_t(RTC_INVALID_GEOMETRY_ID), 0.f};
+        }
+
+        // Copy the initialized data to the device
+        queue_.memcpy(ci_previous_geom_prim_ID_tfar,
+                      host_previous_geom_prim_ID_tfar.get(),
+                      num_rays * sizeof(callbacks::GeomPrimID))
+                .wait();
+
+        auto scene = this->scene_;
+        auto ci_previous_geom_prim_ID_tfar_ = ci_previous_geom_prim_ID_tfar;
+        queue_.submit([=](sycl::handler& cgh) {
+            cgh.parallel_for(
+                    sycl::range<1>(num_rays),
+                    [=](sycl::item<1> item, sycl::kernel_handler kh) {
+                        callbacks::CountIntersectionsContext context;
+                        rtcInitRayQueryContext(&context.context);
+                        context.previous_geom_prim_ID_tfar =
+                                ci_previous_geom_prim_ID_tfar_;
+                        context.intersections = intersections;
+
+                        RTCIntersectArguments args;
+                        rtcInitIntersectArguments(&args);
+                        args.filter = callbacks::CountIntersectionsFunc;
+                        args.context = &context.context;
+
+                        const size_t i = item.get_id(0);
+
+                        struct RTCRayHit rh;
+                        const float* r = &rays[i * 6];
+                        rh.ray.org_x = r[0];
+                        rh.ray.org_y = r[1];
+                        rh.ray.org_z = r[2];
+                        rh.ray.dir_x = r[3];
+                        rh.ray.dir_y = r[4];
+                        rh.ray.dir_z = r[5];
+                        rh.ray.tnear = 0;
+                        rh.ray.tfar = std::numeric_limits<float>::infinity();
+                        rh.ray.mask = -1;
+                        rh.ray.flags = 0;
+                        rh.ray.id = i;
+                        rh.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+                        rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+
+                        rtcIntersect1(scene, &rh, &args);
+                    });
+        });
+        queue_.wait_and_throw();
+
+        // Free the allocated memory
+        sycl::free(ci_previous_geom_prim_ID_tfar, queue_);
+        ci_previous_geom_prim_ID_tfar = nullptr;
+    }
+
+    void ListIntersections(const float* const rays,
+                           const size_t num_rays,
+                           const size_t num_intersections,
+                           int* cumsum,
+                           unsigned int* track_intersections,
+                           unsigned int* ray_ids,
+                           unsigned int* geometry_ids,
+                           unsigned int* primitive_ids,
+                           float* primitive_uvs,
+                           float* t_hit,
+                           const int nthreads) override {
+        CommitScene();
+
+        queue_.memset(track_intersections, 0, sizeof(uint32_t) * num_rays)
+                .wait();
+        queue_.memset(ray_ids, 0, sizeof(uint32_t) * num_intersections).wait();
+        queue_.memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections)
+                .wait();
+        queue_.memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections)
+                .wait();
+        queue_.memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2)
+                .wait();
+        queue_.memset(t_hit, 0, sizeof(float) * num_intersections).wait();
+
+        li_previous_geom_prim_ID_tfar =
+                sycl::malloc_device<callbacks::GeomPrimID>(num_rays, queue_);
+
+        // Check if allocation was successful
+        if (!li_previous_geom_prim_ID_tfar) {
+            throw std::runtime_error("Failed to allocate device memory");
+        }
+
+        auto host_previous_geom_prim_ID_tfar =
+                std::unique_ptr<callbacks::GeomPrimID[],
+                                std::default_delete<callbacks::GeomPrimID[]>>(
+                        new callbacks::GeomPrimID[num_rays]);
+        for (size_t i = 0; i < num_rays; ++i) {
+            host_previous_geom_prim_ID_tfar[i] = {
+                    uint32_t(RTC_INVALID_GEOMETRY_ID),
+                    uint32_t(RTC_INVALID_GEOMETRY_ID), 0.f};
+        }
+
+        // Copy the initialized data to the device
+        queue_.memcpy(li_previous_geom_prim_ID_tfar,
+                      host_previous_geom_prim_ID_tfar.get(),
+                      num_rays * sizeof(callbacks::GeomPrimID))
+                .wait();
+
+        auto scene = this->scene_;
+        auto li_previous_geom_prim_ID_tfar_ = li_previous_geom_prim_ID_tfar;
+        queue_.submit([=](sycl::handler& cgh) {
+            cgh.parallel_for(
+                    sycl::range<1>(num_rays),
+                    [=](sycl::item<1> item, sycl::kernel_handler kh) {
+                        callbacks::ListIntersectionsContext context;
+                        rtcInitRayQueryContext(&context.context);
+                        context.previous_geom_prim_ID_tfar =
+                                li_previous_geom_prim_ID_tfar_;
+                        context.ray_ids = ray_ids;
+                        context.geometry_ids = geometry_ids;
+                        context.primitive_ids = primitive_ids;
+                        context.primitive_uvs = primitive_uvs;
+                        context.t_hit = t_hit;
+                        context.cumsum = cumsum;
+                        context.track_intersections = track_intersections;
+
+                        RTCIntersectArguments args;
+                        rtcInitIntersectArguments(&args);
+                        args.filter = callbacks::ListIntersectionsFunc;
+                        args.context = &context.context;
+
+                        const size_t i = item.get_id(0);
+
+                        struct RTCRayHit rh;
+                        const float* r = &rays[i * 6];
+                        rh.ray.org_x = r[0];
+                        rh.ray.org_y = r[1];
+                        rh.ray.org_z = r[2];
+                        rh.ray.dir_x = r[3];
+                        rh.ray.dir_y = r[4];
+                        rh.ray.dir_z = r[5];
+                        rh.ray.tnear = 0;
+                        rh.ray.tfar = std::numeric_limits<float>::infinity();
+                        rh.ray.mask = -1;
+                        rh.ray.flags = 0;
+                        rh.ray.id = i;
+                        rh.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+                        rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+
+                        rtcIntersect1(scene, &rh, &args);
+                    });
+        });
+        queue_.wait_and_throw();
+
+        // Free the allocated memory
+        sycl::free(li_previous_geom_prim_ID_tfar, queue_);
+        li_previous_geom_prim_ID_tfar = nullptr;
+    }
+
+    void ComputeClosestPoints(const float* const query_points,
+                              const size_t num_query_points,
+                              float* closest_points,
+                              unsigned int* geometry_ids,
+                              unsigned int* primitive_ids,
+                              float* primitive_uvs,
+                              float* primitive_normals,
+                              const int nthreads) override {
+        throw std::logic_error("Function not yet implemented");
+    }
+
+    void ArraySum(int* data_ptr, size_t num_elements, size_t& result) override {
+        sycl::buffer<size_t, 1> result_buf(&result, sycl::range<1>(1));
+
+        queue_.submit([&](sycl::handler& cgh) {
+            auto result_acc =
+                    result_buf.get_access<sycl::access::mode::read_write>(cgh);
+            cgh.parallel_for(
+                    sycl::range<1>(num_elements),
+                    [=](sycl::item<1> item, sycl::kernel_handler kh) {
+                        const size_t i = item.get_id(0);
+                        sycl::atomic_ref<size_t, sycl::memory_order::relaxed,
+                                         sycl::memory_scope::device>
+                                atomic_result_data(result_acc[0]);
+                        atomic_result_data.fetch_add(data_ptr[i]);
+                    });
+        });
+        queue_.wait_and_throw();
+    }
+
+    void ArrayPartialSum(int* input,
+                         int* output,
+                         size_t num_elements) override {
+        queue_.submit([&](sycl::handler& cgh) {
+            cgh.single_task([=]() {
+                for (size_t idx = 1; idx < num_elements; ++idx) {
+                    output[idx] = output[idx - 1] + input[idx - 1];
+                }
+            });
+        });
+
+        queue_.wait_and_throw();
+    }
+
+    void CopyArray(int* src, uint32_t* dst, size_t num_elements) override {
+        queue_.memcpy(dst, src, num_elements * sizeof(uint32_t)).wait();
+    }
+};
+#endif
+
+struct RaycastingScene::CPUImpl : public RaycastingScene::Impl {
+    // The maximum number of rays used in calls to embree.
+    const size_t BATCH_SIZE = 1024;
+
+    void CastRays(const float* const rays,
+                  const size_t num_rays,
+                  float* t_hit,
+                  unsigned int* geometry_ids,
+                  unsigned int* primitive_ids,
+                  float* primitive_uvs,
+                  float* primitive_normals,
+                  const int nthreads,
+                  const bool line_intersection) override {
         CommitScene();
 
         auto LoopFn = [&](const tbb::blocked_range<size_t>& range) {
@@ -370,7 +845,7 @@ struct RaycastingScene::Impl {
                 rh.ray.org_x = r[0];
                 rh.ray.org_y = r[1];
                 rh.ray.org_z = r[2];
-                if (LINE_INTERSECTION) {
+                if (line_intersection) {
                     rh.ray.dir_x = r[3] - r[0];
                     rh.ray.dir_y = r[4] - r[1];
                     rh.ray.dir_z = r[5] - r[2];
@@ -380,7 +855,7 @@ struct RaycastingScene::Impl {
                     rh.ray.dir_z = r[5];
                 }
                 rh.ray.tnear = 0;
-                if (LINE_INTERSECTION) {
+                if (line_intersection) {
                     rh.ray.tfar = 1.f;
                 } else {
                     rh.ray.tfar = std::numeric_limits<float>::infinity();
@@ -440,7 +915,7 @@ struct RaycastingScene::Impl {
                         const float tnear,
                         const float tfar,
                         int8_t* occluded,
-                        const int nthreads) {
+                        const int nthreads) override {
         CommitScene();
 
         struct RTCRayQueryContext context;
@@ -495,26 +970,29 @@ struct RaycastingScene::Impl {
     void CountIntersections(const float* const rays,
                             const size_t num_rays,
                             int* intersections,
-                            const int nthreads) {
+                            const int nthreads) override {
         CommitScene();
 
-        memset(intersections, 0, sizeof(int) * num_rays);
+        std::memset(intersections, 0, sizeof(int) * num_rays);
 
-        std::vector<std::tuple<uint32_t, uint32_t, float>>
-                previous_geom_prim_ID_tfar(
-                        num_rays,
-                        std::make_tuple(uint32_t(RTC_INVALID_GEOMETRY_ID),
-                                        uint32_t(RTC_INVALID_GEOMETRY_ID),
-                                        0.f));
+        auto previous_geom_prim_ID_tfar =
+                std::unique_ptr<callbacks::GeomPrimID[],
+                                std::default_delete<callbacks::GeomPrimID[]>>(
+                        new callbacks::GeomPrimID[num_rays]);
+        for (size_t i = 0; i < num_rays; ++i) {
+            previous_geom_prim_ID_tfar[i] = {uint32_t(RTC_INVALID_GEOMETRY_ID),
+                                             uint32_t(RTC_INVALID_GEOMETRY_ID),
+                                             0.f};
+        }
 
-        CountIntersectionsContext context;
+        callbacks::CountIntersectionsContext context;
         rtcInitRayQueryContext(&context.context);
-        context.previous_geom_prim_ID_tfar = &previous_geom_prim_ID_tfar;
+        context.previous_geom_prim_ID_tfar = previous_geom_prim_ID_tfar.get();
         context.intersections = intersections;
 
         RTCIntersectArguments args;
         rtcInitIntersectArguments(&args);
-        args.filter = CountIntersectionsFunc;
+        args.filter = callbacks::CountIntersectionsFunc;
         args.context = &context.context;
 
         auto LoopFn = [&](const tbb::blocked_range<size_t>& range) {
@@ -558,33 +1036,36 @@ struct RaycastingScene::Impl {
     void ListIntersections(const float* const rays,
                            const size_t num_rays,
                            const size_t num_intersections,
-                           const Eigen::VectorXi& cumsum,
+                           int* cumsum,
                            unsigned int* track_intersections,
                            unsigned int* ray_ids,
                            unsigned int* geometry_ids,
                            unsigned int* primitive_ids,
                            float* primitive_uvs,
                            float* t_hit,
-                           const int nthreads) {
+                           const int nthreads) override {
         CommitScene();
 
-        memset(track_intersections, 0, sizeof(uint32_t) * num_rays);
-        memset(ray_ids, 0, sizeof(uint32_t) * num_intersections);
-        memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections);
-        memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections);
-        memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2);
-        memset(t_hit, 0, sizeof(float) * num_intersections);
-
-        std::vector<std::tuple<uint32_t, uint32_t, float>>
-                previous_geom_prim_ID_tfar(
-                        num_rays,
-                        std::make_tuple(uint32_t(RTC_INVALID_GEOMETRY_ID),
-                                        uint32_t(RTC_INVALID_GEOMETRY_ID),
-                                        0.f));
-
-        ListIntersectionsContext context;
+        std::memset(track_intersections, 0, sizeof(uint32_t) * num_rays);
+        std::memset(ray_ids, 0, sizeof(uint32_t) * num_intersections);
+        std::memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections);
+        std::memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections);
+        std::memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2);
+        std::memset(t_hit, 0, sizeof(float) * num_intersections);
+
+        auto previous_geom_prim_ID_tfar =
+                std::unique_ptr<callbacks::GeomPrimID[],
+                                std::default_delete<callbacks::GeomPrimID[]>>(
+                        new callbacks::GeomPrimID[num_rays]);
+        for (size_t i = 0; i < num_rays; ++i) {
+            previous_geom_prim_ID_tfar[i] = {uint32_t(RTC_INVALID_GEOMETRY_ID),
+                                             uint32_t(RTC_INVALID_GEOMETRY_ID),
+                                             0.f};
+        }
+
+        callbacks::ListIntersectionsContext context;
         rtcInitRayQueryContext(&context.context);
-        context.previous_geom_prim_ID_tfar = &previous_geom_prim_ID_tfar;
+        context.previous_geom_prim_ID_tfar = previous_geom_prim_ID_tfar.get();
         context.ray_ids = ray_ids;
         context.geometry_ids = geometry_ids;
         context.primitive_ids = primitive_ids;
@@ -595,7 +1076,7 @@ struct RaycastingScene::Impl {
 
         RTCIntersectArguments args;
         rtcInitIntersectArguments(&args);
-        args.filter = ListIntersectionsFunc;
+        args.filter = callbacks::ListIntersectionsFunc;
         args.context = &context.context;
 
         auto LoopFn = [&](const tbb::blocked_range<size_t>& range) {
@@ -643,7 +1124,7 @@ struct RaycastingScene::Impl {
                               unsigned int* primitive_ids,
                               float* primitive_uvs,
                               float* primitive_normals,
-                              const int nthreads) {
+                              const int nthreads) override {
         CommitScene();
 
         auto LoopFn = [&](const tbb::blocked_range<size_t>& range) {
@@ -655,12 +1136,13 @@ struct RaycastingScene::Impl {
                 query.radius = std::numeric_limits<float>::infinity();
                 query.time = 0.f;
 
-                ClosestPointResult result;
-                result.geometry_ptrs_ptr = &geometry_ptrs_;
+                ClosestPointResult<Vec3f, Vec2f> result;
+                result.geometry_ptrs_ptr = geometry_ptrs_.data();
 
                 RTCPointQueryContext instStack;
                 rtcInitPointQueryContext(&instStack);
-                rtcPointQuery(scene_, &query, &instStack, &ClosestPointFunc,
+                rtcPointQuery(scene_, &query, &instStack,
+                              &ClosestPointFunc<Vec3f, Vec3fa, Vec2f>,
                               (void*)&result);
 
                 closest_points[3 * i + 0] = result.p.x();
@@ -689,16 +1171,44 @@ struct RaycastingScene::Impl {
                     LoopFn);
         }
     }
+
+    void ArraySum(int* data_ptr, size_t num_elements, size_t& result) override {
+        result = std::accumulate(data_ptr, data_ptr + num_elements, result);
+    }
+
+    void ArrayPartialSum(int* input,
+                         int* output,
+                         size_t num_elements) override {
+        output[0] = 0;
+        std::partial_sum(input, input + num_elements - 1, output + 1);
+    }
+
+    void CopyArray(int* src, uint32_t* dst, size_t num_elements) override {
+        std::copy(src, src + num_elements, dst);
+    }
 };
 
-RaycastingScene::RaycastingScene(int64_t nthreads)
-    : impl_(new RaycastingScene::Impl()) {
-    if (nthreads > 0) {
-        std::string config("threads=" + std::to_string(nthreads));
-        impl_->device_ = rtcNewDevice(config.c_str());
+RaycastingScene::RaycastingScene(int64_t nthreads, const core::Device& device) {
+#ifdef BUILD_SYCL_MODULE
+    if (device.IsSYCL()) {
+        impl_ = std::make_unique<SYCLImpl>();
+        dynamic_cast<RaycastingScene::SYCLImpl*>(impl_.get())
+                ->InitializeDevice();
+        impl_->tensor_device_ = device;
     } else {
-        impl_->device_ = rtcNewDevice(NULL);
+#endif
+        impl_ = std::make_unique<CPUImpl>();
+
+        if (nthreads > 0) {
+            std::string config("threads=" + std::to_string(nthreads));
+            impl_->device_ = rtcNewDevice(config.c_str());
+        } else {
+            impl_->device_ = rtcNewDevice(NULL);
+        }
+#ifdef BUILD_SYCL_MODULE
     }
+#endif
+
     rtcSetDeviceErrorFunction(impl_->device_, ErrorFunction, NULL);
 
     impl_->scene_ = rtcNewScene(impl_->device_);
@@ -746,13 +1256,37 @@ uint32_t RaycastingScene::AddTriangles(const core::Tensor& vertex_positions,
 
     {
         auto data = vertex_positions.Contiguous();
-        memcpy(vertex_buffer, data.GetDataPtr(),
-               sizeof(float) * 3 * num_vertices);
+#ifdef BUILD_SYCL_MODULE
+        if (impl_->tensor_device_.IsSYCL()) {
+            dynamic_cast<RaycastingScene::SYCLImpl*>(impl_.get())
+                    ->queue_
+                    .memcpy(vertex_buffer, data.GetDataPtr(),
+                            sizeof(float) * 3 * num_vertices)
+                    .wait();
+        } else {
+#endif
+            std::memcpy(vertex_buffer, data.GetDataPtr(),
+                        sizeof(float) * 3 * num_vertices);
+#ifdef BUILD_SYCL_MODULE
+        }
+#endif
     }
     {
         auto data = triangle_indices.Contiguous();
-        memcpy(index_buffer, data.GetDataPtr(),
-               sizeof(uint32_t) * 3 * num_triangles);
+#ifdef BUILD_SYCL_MODULE
+        if (impl_->tensor_device_.IsSYCL()) {
+            dynamic_cast<RaycastingScene::SYCLImpl*>(impl_.get())
+                    ->queue_
+                    .memcpy(index_buffer, data.GetDataPtr(),
+                            sizeof(uint32_t) * 3 * num_triangles)
+                    .wait();
+        } else {
+#endif
+            std::memcpy(index_buffer, data.GetDataPtr(),
+                        sizeof(uint32_t) * 3 * num_triangles);
+#ifdef BUILD_SYCL_MODULE
+        }
+#endif
     }
     rtcSetGeometryEnableFilterFunctionFromArguments(geom, true);
     rtcCommitGeometry(geom);
@@ -760,9 +1294,10 @@ uint32_t RaycastingScene::AddTriangles(const core::Tensor& vertex_positions,
     uint32_t geom_id = rtcAttachGeometry(impl_->scene_, geom);
     rtcReleaseGeometry(geom);
 
-    impl_->geometry_ptrs_.push_back(std::make_tuple(RTC_GEOMETRY_TYPE_TRIANGLE,
-                                                    (const void*)vertex_buffer,
-                                                    (const void*)index_buffer));
+    GeometryPtr geometry_ptr = {RTC_GEOMETRY_TYPE_TRIANGLE,
+                                (const void*)vertex_buffer,
+                                (const void*)index_buffer};
+    impl_->geometry_ptrs_.push_back(geometry_ptr);
     return geom_id;
 }
 
@@ -787,22 +1322,26 @@ std::unordered_map<std::string, core::Tensor> RaycastingScene::CastRays(
     size_t num_rays = shape.NumElements();
 
     std::unordered_map<std::string, core::Tensor> result;
-    result["t_hit"] = core::Tensor(shape, core::Float32);
-    result["geometry_ids"] = core::Tensor(shape, core::UInt32);
-    result["primitive_ids"] = core::Tensor(shape, core::UInt32);
+    result["t_hit"] = core::Tensor(shape, core::Float32, rays.GetDevice());
+    result["geometry_ids"] =
+            core::Tensor(shape, core::UInt32, rays.GetDevice());
+    result["primitive_ids"] =
+            core::Tensor(shape, core::UInt32, rays.GetDevice());
     shape.push_back(2);
-    result["primitive_uvs"] = core::Tensor(shape, core::Float32);
+    result["primitive_uvs"] =
+            core::Tensor(shape, core::Float32, rays.GetDevice());
     shape.back() = 3;
-    result["primitive_normals"] = core::Tensor(shape, core::Float32);
+    result["primitive_normals"] =
+            core::Tensor(shape, core::Float32, rays.GetDevice());
 
     auto data = rays.Contiguous();
-    impl_->CastRays<false>(data.GetDataPtr<float>(), num_rays,
-                           result["t_hit"].GetDataPtr<float>(),
-                           result["geometry_ids"].GetDataPtr<uint32_t>(),
-                           result["primitive_ids"].GetDataPtr<uint32_t>(),
-                           result["primitive_uvs"].GetDataPtr<float>(),
-                           result["primitive_normals"].GetDataPtr<float>(),
-                           nthreads);
+    impl_->CastRays(data.GetDataPtr<float>(), num_rays,
+                    result["t_hit"].GetDataPtr<float>(),
+                    result["geometry_ids"].GetDataPtr<uint32_t>(),
+                    result["primitive_ids"].GetDataPtr<uint32_t>(),
+                    result["primitive_uvs"].GetDataPtr<float>(),
+                    result["primitive_normals"].GetDataPtr<float>(), nthreads,
+                    false);
 
     return result;
 }
@@ -818,7 +1357,7 @@ core::Tensor RaycastingScene::TestOcclusions(const core::Tensor& rays,
                        // results.
     size_t num_rays = shape.NumElements();
 
-    core::Tensor result(shape, core::Bool);
+    core::Tensor result(shape, core::Bool, rays.GetDevice());
 
     auto data = rays.Contiguous();
     impl_->TestOcclusions(data.GetDataPtr<float>(), num_rays, tnear, tfar,
@@ -837,7 +1376,8 @@ core::Tensor RaycastingScene::CountIntersections(const core::Tensor& rays,
                        // results.
     size_t num_rays = shape.NumElements();
 
-    core::Tensor intersections(shape, core::Dtype::FromType<int>());
+    core::Tensor intersections(shape, core::Dtype::FromType<int>(),
+                               impl_->tensor_device_);
 
     auto data = rays.Contiguous();
 
@@ -858,49 +1398,57 @@ RaycastingScene::ListIntersections(const core::Tensor& rays,
     size_t num_rays = shape.NumElements();
 
     // determine total number of intersections
-    core::Tensor intersections(shape, core::Dtype::FromType<int>());
-    core::Tensor track_intersections(shape, core::Dtype::FromType<uint32_t>());
+    core::Tensor intersections(shape, core::Dtype::FromType<int>(),
+                               impl_->tensor_device_);
+    core::Tensor track_intersections(shape, core::Dtype::FromType<uint32_t>(),
+                                     impl_->tensor_device_);
     auto data = rays.Contiguous();
     impl_->CountIntersections(data.GetDataPtr<float>(), num_rays,
                               intersections.GetDataPtr<int>(), nthreads);
 
     // prepare shape with that number of elements
-    Eigen::Map<Eigen::VectorXi> intersections_vector(
-            intersections.GetDataPtr<int>(), num_rays);
-    size_t num_intersections = intersections_vector.sum();
+    int* data_ptr = intersections.GetDataPtr<int>();
+    size_t num_intersections = 0;
+    impl_->ArraySum(data_ptr, num_rays, num_intersections);
 
     // prepare ray allocations (cumsum)
-    Eigen::VectorXi cumsum = Eigen::MatrixXi::Zero(num_rays, 1);
-    std::partial_sum(intersections_vector.begin(),
-                     intersections_vector.end() - 1, cumsum.begin() + 1,
-                     std::plus<int>());
+    core::Tensor cumsum_tensor_cpu =
+            core::Tensor::Zeros(shape, core::Dtype::FromType<int>());
+    core::Tensor cumsum_tensor = cumsum_tensor_cpu.To(impl_->tensor_device_);
+    int* cumsum_ptr = cumsum_tensor.GetDataPtr<int>();
+    impl_->ArrayPartialSum(data_ptr, cumsum_ptr, num_rays);
 
     // generate results structure
     std::unordered_map<std::string, core::Tensor> result;
     shape.clear();
     shape.push_back(num_rays + 1);
     result["ray_splits"] = core::Tensor(shape, core::UInt32);
+
     uint32_t* ptr = result["ray_splits"].GetDataPtr<uint32_t>();
-    for (int i = 0; i < cumsum.size(); ++i) {
-        ptr[i] = cumsum[i];
-    }
+    impl_->CopyArray(cumsum_ptr, ptr, num_rays);
+
     ptr[num_rays] = num_intersections;
-    shape[0] = intersections_vector.sum();
-    result["ray_ids"] = core::Tensor(shape, core::UInt32);
-    result["geometry_ids"] = core::Tensor(shape, core::UInt32);
-    result["primitive_ids"] = core::Tensor(shape, core::UInt32);
-    result["t_hit"] = core::Tensor(shape, core::Float32);
+    shape[0] = num_intersections;
+    result["ray_ids"] =
+            core::Tensor(shape, core::UInt32, impl_->tensor_device_);
+    result["geometry_ids"] =
+            core::Tensor(shape, core::UInt32, impl_->tensor_device_);
+    result["primitive_ids"] =
+            core::Tensor(shape, core::UInt32, impl_->tensor_device_);
+    result["t_hit"] = core::Tensor(shape, core::Float32, impl_->tensor_device_);
     shape.push_back(2);
-    result["primitive_uvs"] = core::Tensor(shape, core::Float32);
+    result["primitive_uvs"] =
+            core::Tensor(shape, core::Float32, impl_->tensor_device_);
 
     impl_->ListIntersections(data.GetDataPtr<float>(), num_rays,
-                             num_intersections, cumsum,
+                             num_intersections, cumsum_ptr,
                              track_intersections.GetDataPtr<uint32_t>(),
                              result["ray_ids"].GetDataPtr<uint32_t>(),
                              result["geometry_ids"].GetDataPtr<uint32_t>(),
                              result["primitive_ids"].GetDataPtr<uint32_t>(),
                              result["primitive_uvs"].GetDataPtr<float>(),
                              result["t_hit"].GetDataPtr<float>(), nthreads);
+
     return result;
 }
 
diff --git a/cpp/open3d/t/geometry/RaycastingScene.h b/cpp/open3d/t/geometry/RaycastingScene.h
index f25d994b0b5..3a533aba96e 100644
--- a/cpp/open3d/t/geometry/RaycastingScene.h
+++ b/cpp/open3d/t/geometry/RaycastingScene.h
@@ -30,14 +30,16 @@ namespace geometry {
 class RaycastingScene {
 public:
     /// \brief Default Constructor.
-    RaycastingScene(int64_t nthreads = 0);
+    RaycastingScene(int64_t nthreads = 0,
+                    const core::Device &device = core::Device("CPU:0"));
 
     ~RaycastingScene();
 
     /// \brief Add a triangle mesh to the scene.
     /// \param vertex_positions Vertices as Tensor of dim {N,3} and dtype float.
     /// \param triangle_indices Triangles as Tensor of dim {M,3} and dtype
-    /// uint32_t. \return The geometry ID of the added mesh.
+    /// uint32_t.
+    /// \return The geometry ID of the added mesh.
     uint32_t AddTriangles(const core::Tensor &vertex_positions,
                           const core::Tensor &triangle_indices);
 
@@ -250,6 +252,10 @@ class RaycastingScene {
 
 private:
     struct Impl;
+    struct CPUImpl;
+#ifdef BUILD_SYCL_MODULE
+    struct SYCLImpl;
+#endif
     std::unique_ptr<Impl> impl_;
 };
 
diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt
index c79bbd96719..6efae9a17fd 100644
--- a/cpp/pybind/CMakeLists.txt
+++ b/cpp/pybind/CMakeLists.txt
@@ -106,7 +106,7 @@ endif()
 # Include additional libraries that may be absent from the user system
 # eg: libc++.so, libc++abi.so (needed by filament) for Linux.
 # libc++.so is a linker script including libc++.so.1 and libc++abi.so, so append 1 to libc++.so
-set(PYTHON_EXTRA_LIBRARIES $<TARGET_FILE:tbb>)
+set(PYTHON_EXTRA_LIBRARIES $<TARGET_FILE:TBB::tbb>)
 if (BUILD_GUI AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
     list(APPEND PYTHON_EXTRA_LIBRARIES ${CPP_LIBRARY}.1 ${CPPABI_LIBRARY})
 endif()
diff --git a/cpp/pybind/core/sycl_utils.cpp b/cpp/pybind/core/sycl_utils.cpp
index 0acf8c37a5b..e50d82b8f53 100644
--- a/cpp/pybind/core/sycl_utils.cpp
+++ b/cpp/pybind/core/sycl_utils.cpp
@@ -13,7 +13,20 @@ namespace open3d {
 namespace core {
 
 void pybind_sycl_utils_definitions(py::module& m) {
-    m.def("sycl_demo", &sycl::SYCLDemo);
+    m.def("sycl_demo", &sy::SYCLDemo);
+
+    py::module m_sycl = m.def_submodule("sycl");
+    m_sycl.def("is_available", sy::IsAvailable,
+               "Returns true if Open3D is compiled with SYCL support and at "
+               "least one compatible SYCL device is detected.");
+
+    m_sycl.def("get_available_devices", sy::GetAvailableSYCLDevices,
+               "Return a list of available SYCL devices.");
+
+    m_sycl.def("enable_persistent_jit_cache", sy::enablePersistentJITCache,
+               "Enables the JIT cache for SYCL. This sets an environment "
+               "variable and "
+               "will affect the entire process and any child processes.");
 }
 
 }  // namespace core
diff --git a/cpp/pybind/t/geometry/raycasting_scene.cpp b/cpp/pybind/t/geometry/raycasting_scene.cpp
index b3e8983aaef..b611dfcfbb0 100644
--- a/cpp/pybind/t/geometry/raycasting_scene.cpp
+++ b/cpp/pybind/t/geometry/raycasting_scene.cpp
@@ -22,7 +22,7 @@ or compute the closest point on the surface of a mesh with respect to one
 or more query points.
 It builds an internal acceleration structure to speed up those queries.
 
-This class supports only the CPU device.
+This class supports the CPU device and SYCL GPU device.
 
 The following shows how to create a scene and compute ray intersections::
 
@@ -58,11 +58,13 @@ void pybind_raycasting_scene_definitions(py::module& m) {
     auto raycasting_scene =
             static_cast<py::class_<RaycastingScene>>(m.attr("RaycastingScene"));
     // Constructors.
-    raycasting_scene.def(py::init<int64_t>(), "nthreads"_a = 0, R"doc(
+    raycasting_scene.def(py::init<int64_t, core::Device>(), "nthreads"_a = 0,
+                         "device"_a = core::Device("CPU:0"), R"doc(
 Create a RaycastingScene.
 
 Args:
     nthreads (int): The number of threads to use for building the scene. Set to 0 for automatic.
+    device (open3d.core.Device): The device to use. Currently CPU and SYCL devices are supported.
 )doc");
 
     raycasting_scene.def(
diff --git a/cpp/tests/core/SYCLUtils.cpp b/cpp/tests/core/SYCLUtils.cpp
index d10d99f09fa..e4462f53be9 100644
--- a/cpp/tests/core/SYCLUtils.cpp
+++ b/cpp/tests/core/SYCLUtils.cpp
@@ -18,18 +18,18 @@
 namespace open3d {
 namespace tests {
 
-TEST(SYCLUtils, SYCLDemo) { core::sycl::SYCLDemo(); }
+TEST(SYCLUtils, SYCLDemo) { core::sy::SYCLDemo(); }
 
 TEST(SYCLUtils, PrintAllSYCLDevices) {
-    core::sycl::PrintSYCLDevices(/*print_all=*/true);
+    core::sy::PrintSYCLDevices(/*print_all=*/true);
 }
 
 TEST(SYCLUtils, PrintSYCLDevices) {
-    core::sycl::PrintSYCLDevices(/*print_all=*/false);
+    core::sy::PrintSYCLDevices(/*print_all=*/false);
 }
 
 TEST(SYCLUtils, SYCLUnifiedSharedMemory) {
-    if (!core::sycl::IsAvailable()) {
+    if (!core::sy::IsAvailable()) {
         return;
     }
 
diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
old mode 100644
new mode 100755
index 73feb7bffd6..ac98e50236f
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -89,7 +89,7 @@ RUN apt-get update && apt-get install -y \
     liblzma-dev \
  && rm -rf /var/lib/apt/lists/*
 
-# pyenv or Intel Python
+# pyenv
 # The pyenv python paths are used during docker run, in this way docker run
 # does not need to activate the environment again.
 # The soft link from the python patch level version to the python mino version
@@ -97,15 +97,12 @@ RUN apt-get update && apt-get install -y \
 # which patch level pyenv will install (latest).
 ENV PYENV_ROOT=/root/.pyenv
 ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PYENV_ROOT/versions/$PYTHON_VERSION/bin:$PATH"
-ENV PATH="/opt/intel/oneapi/intelpython/latest/bin:${PATH}"
-RUN if [ "${BUILD_SYCL_MODULE}" = "OFF" ]; then \
-        curl https://pyenv.run | bash \
+RUN curl https://pyenv.run | bash \
         && pyenv update \
         && pyenv install $PYTHON_VERSION \
         && pyenv global $PYTHON_VERSION \
         && pyenv rehash \
-        && ln -s $PYENV_ROOT/versions/${PYTHON_VERSION}* $PYENV_ROOT/versions/${PYTHON_VERSION}; \
-    fi
+        && ln -s $PYENV_ROOT/versions/${PYTHON_VERSION}* $PYENV_ROOT/versions/${PYTHON_VERSION};
 RUN python --version && pip --version
 
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
@@ -191,9 +188,11 @@ RUN \
         export CMAKE_CXX_COMPILER=icpx; \
         export CMAKE_C_COMPILER=icx; \
         export GLIBCXX_USE_CXX11_ABI=ON; \
+        export BUILD_ISPC_MODULE=OFF; \
     else \
         export CMAKE_CXX_COMPILER=g++; \
         export CMAKE_C_COMPILER=gcc; \
+        export BUILD_ISPC_MODULE=ON; \
         # TODO: PyTorch still use old CXX ABI, remove this line when PyTorch is updated
         if [ "$BUILD_PYTORCH_OPS" = "ON" ]; then \
             export GLIBCXX_USE_CXX11_ABI=OFF; \
@@ -208,11 +207,12 @@ RUN \
           -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} \
           -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} \
           -DBUILD_SYCL_MODULE=${BUILD_SYCL_MODULE} \
+          -DBUILD_ISPC_MODULE=${BUILD_ISPC_MODULE} \
           -DDEVELOPER_BUILD=${DEVELOPER_BUILD} \
           -DBUILD_LIBREALSENSE=ON \
           -DBUILD_CUDA_MODULE=${BUILD_CUDA_MODULE} \
           -DBUILD_COMMON_CUDA_ARCHS=ON \
-          -DBUILD_COMMON_ISPC_ISAS=ON \
+          -DBUILD_COMMON_ISPC_ISAS=${BUILD_ISPC_MODULE} \
           -DGLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} \
           -DBUILD_TENSORFLOW_OPS=${BUILD_TENSORFLOW_OPS} \
           -DBUILD_PYTORCH_OPS=${BUILD_PYTORCH_OPS} \
diff --git a/docker/docker_build.sh b/docker/docker_build.sh
index c92aed26829..56cc75fb030 100755
--- a/docker/docker_build.sh
+++ b/docker/docker_build.sh
@@ -82,7 +82,7 @@ HOST_OPEN3D_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null 2>&1 && pw
 
 # Shared variables
 CCACHE_VERSION=4.3
-CMAKE_VERSION=cmake-3.24.4-linux-x86_64
+CMAKE_VERSION=cmake-3.29.2-linux-x86_64
 CMAKE_VERSION_AARCH64=cmake-3.24.4-linux-aarch64
 CUDA_VERSION=12.1.0-cudnn8
 CUDA_VERSION_LATEST=12.1.0-cudnn8
@@ -447,8 +447,8 @@ sycl-shared_export_env() {
     export DOCKER_TAG=open3d-ci:sycl-shared
 
     # https://hub.docker.com/r/intel/oneapi-basekit
-    # https://github.com/intel/oneapi-containers/blob/main/images/docker/basekit/Dockerfile.ubuntu-20.04
-    export BASE_IMAGE=intel/oneapi-basekit:2022.2-devel-ubuntu20.04
+    # https://github.com/intel/oneapi-containers/blob/master/images/docker/basekit/Dockerfile.ubuntu-20.04
+    export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
     export DEVELOPER_BUILD=ON
     export CCACHE_TAR_NAME=open3d-ci-sycl
     export PYTHON_VERSION=3.8
@@ -464,8 +464,8 @@ sycl-static_export_env() {
     export DOCKER_TAG=open3d-ci:sycl-static
 
     # https://hub.docker.com/r/intel/oneapi-basekit
-    # https://github.com/intel/oneapi-containers/blob/main/images/docker/basekit/Dockerfile.ubuntu-20.04
-    export BASE_IMAGE=intel/oneapi-basekit:2022.2-devel-ubuntu20.04
+    # https://github.com/intel/oneapi-containers/blob/master/images/docker/basekit/Dockerfile.ubuntu-20.04
+    export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04
     export DEVELOPER_BUILD=ON
     export CCACHE_TAR_NAME=open3d-ci-sycl
     export PYTHON_VERSION=3.8
diff --git a/examples/cpp/SYCLDemo.cpp b/examples/cpp/SYCLDemo.cpp
index 2fcd8c55dd1..d72db5ea250 100644
--- a/examples/cpp/SYCLDemo.cpp
+++ b/examples/cpp/SYCLDemo.cpp
@@ -8,6 +8,6 @@
 #include "open3d/core/SYCLUtils.h"
 
 int main() {
-    open3d::core::sycl::SYCLDemo();
+    open3d::core::sy::SYCLDemo();
     return 0;
 }
diff --git a/python/test/open3d_test.py b/python/test/open3d_test.py
old mode 100644
new mode 100755
index 46a768746d0..e6997f7a418
--- a/python/test/open3d_test.py
+++ b/python/test/open3d_test.py
@@ -5,14 +5,6 @@
 # SPDX-License-Identifier: MIT
 # ----------------------------------------------------------------------------
 
-import os
-import sys
-import urllib.request
-import zipfile
-
-import numpy as np
-import pytest
-
 
 def torch_available():
     try:
@@ -23,20 +15,21 @@ def torch_available():
     return True
 
 
-def list_devices():
+def list_devices(enable_cuda=True, enable_sycl=False):
     """
-    If Open3D is built with CUDA support:
-    - If cuda device is available, returns [Device("CPU:0"), Device("CUDA:0")].
-    - If cuda device is not available, returns [Device("CPU:0")].
-
-    If Open3D is built without CUDA support:
-    - returns [Device("CPU:0")].
+    Returns a list of devices that are available for Open3D to use:
+    - Device("CPU:0")
+    - Device("CUDA:0") if built with CUDA support and a CUDA device is available.
+    - Device("SYCL:0") if built with SYCL support and a SYCL device is available.
     """
     import open3d as o3d
-    if o3d.core.cuda.device_count() > 0:
-        return [o3d.core.Device("CPU:0"), o3d.core.Device("CUDA:0")]
-    else:
-        return [o3d.core.Device("CPU:0")]
+
+    devices = [o3d.core.Device("CPU:0")]
+    if enable_cuda and o3d.core.cuda.device_count() > 0:
+        devices.append(o3d.core.Device("CUDA:0"))
+    if enable_sycl and o3d.core.sycl.is_available():
+        return devices.append(o3d.core.Device("SYCL:0"))
+    return devices
 
 
 def list_devices_with_torch():
diff --git a/python/test/t/geometry/test_raycasting_scene.py b/python/test/t/geometry/test_raycasting_scene.py
old mode 100644
new mode 100755
index 3ce024a2b29..3fcb9ea2d28
--- a/python/test/t/geometry/test_raycasting_scene.py
+++ b/python/test/t/geometry/test_raycasting_scene.py
@@ -9,57 +9,86 @@
 import numpy as np
 import pytest
 
+import sys
+import os
+
+sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../..")
+from open3d_test import list_devices
+
 
 # test intersection with a single triangle
-def test_cast_rays():
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_cast_rays(device):
     vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]],
-                               dtype=o3d.core.float32)
-    triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32)
+                               dtype=o3d.core.float32,
+                               device=device)
+    triangles = o3d.core.Tensor([[0, 1, 2]],
+                                dtype=o3d.core.uint32,
+                                device=device)
 
-    scene = o3d.t.geometry.RaycastingScene()
+    scene = o3d.t.geometry.RaycastingScene(device=device)
     geom_id = scene.add_triangles(vertices, triangles)
 
-    rays = o3d.core.Tensor([[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]],
-                           dtype=o3d.core.float32)
+    rays = o3d.core.Tensor(
+        [[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]],
+        dtype=o3d.core.float32,
+        device=device,
+    )
     ans = scene.cast_rays(rays)
 
     # first ray hits the triangle
-    assert geom_id == ans['geometry_ids'][0]
-    assert np.isclose(ans['t_hit'][0].item(), 1.0)
+    assert geom_id == ans["geometry_ids"][0].cpu()
+    assert np.isclose(ans["t_hit"][0].item(), 1.0)
 
     # second ray misses
-    assert o3d.t.geometry.RaycastingScene.INVALID_ID == ans['geometry_ids'][1]
-    assert np.isinf(ans['t_hit'][1].item())
+    assert o3d.t.geometry.RaycastingScene.INVALID_ID == ans["geometry_ids"][
+        1].cpu()
+    assert np.isinf(ans["t_hit"][1].item())
 
 
 # cast lots of random rays to test the internal batching
 # we expect no errors for this test
-def test_cast_lots_of_rays():
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_cast_lots_of_rays(device):
     vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]],
-                               dtype=o3d.core.float32)
-    triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32)
+                               dtype=o3d.core.float32,
+                               device=device)
+    triangles = o3d.core.Tensor([[0, 1, 2]],
+                                dtype=o3d.core.uint32,
+                                device=device)
 
-    scene = o3d.t.geometry.RaycastingScene()
+    scene = o3d.t.geometry.RaycastingScene(device=device)
     scene.add_triangles(vertices, triangles)
 
     rs = np.random.RandomState(123)
     rays = o3d.core.Tensor.from_numpy(rs.rand(7654321, 6).astype(np.float32))
+    rays = rays.to(device)
 
     _ = scene.cast_rays(rays)
 
 
 # test occlusion with a single triangle
-def test_test_occlusions():
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_test_occlusions(device):
     vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]],
-                               dtype=o3d.core.float32)
-    triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32)
+                               dtype=o3d.core.float32,
+                               device=device)
+    triangles = o3d.core.Tensor([[0, 1, 2]],
+                                dtype=o3d.core.uint32,
+                                device=device)
 
-    scene = o3d.t.geometry.RaycastingScene()
+    scene = o3d.t.geometry.RaycastingScene(device=device)
     scene.add_triangles(vertices, triangles)
 
-    rays = o3d.core.Tensor([[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]],
-                           dtype=o3d.core.float32)
-    ans = scene.test_occlusions(rays)
+    rays = o3d.core.Tensor(
+        [[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]],
+        dtype=o3d.core.float32,
+        device=device,
+    )
+    ans = scene.test_occlusions(rays).cpu()
 
     # first ray is occluded by the triangle
     assert ans[0] == True
@@ -68,88 +97,124 @@ def test_test_occlusions():
     assert ans[1] == False
 
     # set tfar such that no ray is occluded
-    ans = scene.test_occlusions(rays, tfar=0.5)
+    ans = scene.test_occlusions(rays, tfar=0.5).cpu()
     assert ans.any() == False
 
     # set tnear such that no ray is occluded
-    ans = scene.test_occlusions(rays, tnear=1.5)
+    ans = scene.test_occlusions(rays, tnear=1.5).cpu()
     assert ans.any() == False
 
 
 # test lots of random rays for occlusions to test the internal batching
 # we expect no errors for this test
-def test_test_lots_of_occlusions():
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_test_lots_of_occlusions(device):
     vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]],
-                               dtype=o3d.core.float32)
-    triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32)
+                               dtype=o3d.core.float32,
+                               device=device)
+    triangles = o3d.core.Tensor([[0, 1, 2]],
+                                dtype=o3d.core.uint32,
+                                device=device)
 
-    scene = o3d.t.geometry.RaycastingScene()
+    scene = o3d.t.geometry.RaycastingScene(device=device)
     scene.add_triangles(vertices, triangles)
 
     rs = np.random.RandomState(123)
     rays = o3d.core.Tensor.from_numpy(rs.rand(7654321, 6).astype(np.float32))
+    rays = rays.to(device)
 
     _ = scene.test_occlusions(rays)
 
 
-def test_add_triangle_mesh():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_add_triangle_mesh(device):
+    cube = o3d.t.geometry.TriangleMesh.create_box()
+    cube = cube.to(device)
 
-    scene = o3d.t.geometry.RaycastingScene()
+    scene = o3d.t.geometry.RaycastingScene(device=device)
     scene.add_triangles(cube)
 
-    rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
-                            [10, 10, 10, 1, 0, 0]],
-                           dtype=o3d.core.float32)
+    rays = o3d.core.Tensor(
+        [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
+         [10, 10, 10, 1, 0, 0]],
+        dtype=o3d.core.float32,
+        device=device,
+    )
     ans = scene.count_intersections(rays)
 
-    np.testing.assert_equal(ans.numpy(), [2, 1, 0])
+    np.testing.assert_equal(ans.cpu().numpy(), [2, 1, 0])
 
 
-def test_count_intersections():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_count_intersections(device):
+    cube = o3d.t.geometry.TriangleMesh.create_box()
+    vertex_positions = cube.vertex.positions
+    vertex_positions = vertex_positions.to(device)
+    triangle_indices = cube.triangle.indices
+    triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32)
+    triangle_indices = triangle_indices.to(device)
 
-    scene = o3d.t.geometry.RaycastingScene()
-    scene.add_triangles(cube)
+    scene = o3d.t.geometry.RaycastingScene(device=device)
+    scene.add_triangles(vertex_positions, triangle_indices)
 
-    rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
-                            [10, 10, 10, 1, 0, 0]],
-                           dtype=o3d.core.float32)
+    rays = o3d.core.Tensor(
+        [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
+         [10, 10, 10, 1, 0, 0]],
+        dtype=o3d.core.float32,
+        device=device,
+    )
     ans = scene.count_intersections(rays)
 
-    np.testing.assert_equal(ans.numpy(), [2, 1, 0])
+    np.testing.assert_equal(ans.cpu().numpy(), [2, 1, 0])
 
 
 # count lots of random ray intersections to test the internal batching
 # we expect no errors for this test
-def test_count_lots_of_intersections():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
-
-    scene = o3d.t.geometry.RaycastingScene()
-    scene.add_triangles(cube)
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_count_lots_of_intersections(device):
+    cube = o3d.t.geometry.TriangleMesh.create_box()
+    vertex_positions = cube.vertex.positions
+    vertex_positions = vertex_positions.to(device)
+    triangle_indices = cube.triangle.indices
+    triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32)
+    triangle_indices = triangle_indices.to(device)
+
+    scene = o3d.t.geometry.RaycastingScene(device=device)
+    scene.add_triangles(vertex_positions, triangle_indices)
 
     rs = np.random.RandomState(123)
     rays = o3d.core.Tensor.from_numpy(rs.rand(1234567, 6).astype(np.float32))
+    rays = rays.to(device)
 
     _ = scene.count_intersections(rays)
 
 
-def test_list_intersections():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
-
-    scene = o3d.t.geometry.RaycastingScene()
-    scene.add_triangles(cube)
-
-    rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
-                            [10, 10, 10, 1, 0, 0]],
-                           dtype=o3d.core.float32)
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_list_intersections(device):
+    cube = o3d.t.geometry.TriangleMesh.create_box()
+    vertex_positions = cube.vertex.positions
+    vertex_positions = vertex_positions.to(device)
+    triangle_indices = cube.triangle.indices
+    triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32)
+    triangle_indices = triangle_indices.to(device)
+
+    scene = o3d.t.geometry.RaycastingScene(device=device)
+    scene.add_triangles(vertex_positions, triangle_indices)
+
+    rays = o3d.core.Tensor(
+        [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1],
+         [10, 10, 10, 1, 0, 0]],
+        dtype=o3d.core.float32,
+        device=device,
+    )
     ans = scene.list_intersections(rays)
 
-    np.testing.assert_allclose(ans['t_hit'].numpy(),
+    np.testing.assert_allclose(ans["t_hit"].cpu().numpy(),
                                np.array([1.0, 2.0, 0.5]),
                                rtol=1e-6,
                                atol=1e-6)
@@ -157,15 +222,22 @@ def test_list_intersections():
 
 # list lots of random ray intersections to test the internal batching
 # we expect no errors for this test
-def test_list_lots_of_intersections():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
-
-    scene = o3d.t.geometry.RaycastingScene()
-    scene.add_triangles(cube)
+@pytest.mark.parametrize("device",
+                         list_devices(enable_cuda=False, enable_sycl=True))
+def test_list_lots_of_intersections(device):
+    cube = o3d.t.geometry.TriangleMesh.create_box()
+    vertex_positions = cube.vertex.positions
+    vertex_positions = vertex_positions.to(device)
+    triangle_indices = cube.triangle.indices
+    triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32)
+    triangle_indices = triangle_indices.to(device)
+
+    scene = o3d.t.geometry.RaycastingScene(device=device)
+    scene.add_triangles(vertex_positions, triangle_indices)
 
     rs = np.random.RandomState(123)
     rays = o3d.core.Tensor.from_numpy(rs.rand(123456, 6).astype(np.float32))
+    rays = rays.to(device)
 
     _ = scene.list_intersections(rays)
 
@@ -182,12 +254,14 @@ def test_compute_closest_points():
                                    dtype=o3d.core.float32)
     ans = scene.compute_closest_points(query_points)
 
-    assert (geom_id == ans['geometry_ids']).all()
-    assert (0 == ans['primitive_ids']).all()
-    np.testing.assert_allclose(ans['points'].numpy(),
-                               np.array([[0.2, 0.1, 0.0], [1, 1, 0]]),
-                               rtol=1e-6,
-                               atol=1e-6)
+    assert (geom_id == ans["geometry_ids"]).all()
+    assert (0 == ans["primitive_ids"]).all()
+    np.testing.assert_allclose(
+        ans["points"].numpy(),
+        np.array([[0.2, 0.1, 0.0], [1, 1, 0]]),
+        rtol=1e-6,
+        atol=1e-6,
+    )
 
 
 # compute lots of closest points to test the internal batching
@@ -207,8 +281,7 @@ def test_compute_lots_of_closest_points():
 
 
 def test_compute_distance():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
+    cube = o3d.t.geometry.TriangleMesh.create_box()
 
     scene = o3d.t.geometry.RaycastingScene()
     scene.add_triangles(cube)
@@ -221,8 +294,7 @@ def test_compute_distance():
 
 
 def test_compute_signed_distance():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
+    cube = o3d.t.geometry.TriangleMesh.create_box()
 
     scene = o3d.t.geometry.RaycastingScene()
     scene.add_triangles(cube)
@@ -235,8 +307,7 @@ def test_compute_signed_distance():
 
 
 def test_compute_occupancy():
-    cube = o3d.t.geometry.TriangleMesh.from_legacy(
-        o3d.geometry.TriangleMesh.create_box())
+    cube = o3d.t.geometry.TriangleMesh.create_box()
 
     scene = o3d.t.geometry.RaycastingScene()
     scene.add_triangles(cube)
@@ -276,53 +347,49 @@ def test_output_shapes(shape):
 
     # some outputs append a specific last dim
     last_dim = {
-        't_hit': [],
-        'geometry_ids': [],
-        'primitive_ids': [],
-        'primitive_uvs': [2],
-        'primitive_normals': [3],
-        'points': [3],
-        'ray_ids': [],
-        'ray_splits': []
+        "t_hit": [],
+        "geometry_ids": [],
+        "primitive_ids": [],
+        "primitive_uvs": [2],
+        "primitive_normals": [3],
+        "points": [3],
+        "ray_ids": [],
+        "ray_splits": [],
     }
 
     ans = scene.cast_rays(rays)
     for k, v in ans.items():
         expected_shape = shape + last_dim[k]
-        assert list(
-            v.shape
-        ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format(
-            expected_shape, list(v.shape), k)
+        assert (list(v.shape) == expected_shape
+               ), "shape mismatch: expected {} but got {} for {}".format(
+                   expected_shape, list(v.shape), k)
 
     ans = scene.compute_closest_points(query_points)
     for k, v in ans.items():
         expected_shape = shape + last_dim[k]
-        assert list(
-            v.shape
-        ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format(
-            expected_shape, list(v.shape), k)
+        assert (list(v.shape) == expected_shape
+               ), "shape mismatch: expected {} but got {} for {}".format(
+                   expected_shape, list(v.shape), k)
 
     ans = scene.list_intersections(rays)
     nx = np.sum(scene.count_intersections(rays).numpy()).tolist()
     for k, v in ans.items():
-        if k == 'ray_splits':
+        if k == "ray_splits":
             alt_shape = [np.prod(rays.shape[:-1]) + 1]
         else:
             alt_shape = [nx]
-        #use np.append otherwise issues if alt_shape = [0] and last_dim[k] = []
+        # use np.append otherwise issues if alt_shape = [0] and last_dim[k] = []
         expected_shape = np.append(alt_shape, last_dim[k]).tolist()
-        assert list(
-            v.shape
-        ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format(
-            expected_shape, list(v.shape), k)
+        assert (list(v.shape) == expected_shape
+               ), "shape mismatch: expected {} but got {} for {}".format(
+                   expected_shape, list(v.shape), k)
 
 
 def test_sphere_wrong_occupancy():
     # This test checks a specific scenario where the old implementation
     # without ray jitter produced wrong results for a sphere because some
     # rays miss hitting exactly a vertex or an edge.
-    mesh = o3d.geometry.TriangleMesh.create_sphere(0.8)
-    mesh = o3d.t.geometry.TriangleMesh.from_legacy(mesh)
+    mesh = o3d.t.geometry.TriangleMesh.create_sphere(0.8)
 
     scene = o3d.t.geometry.RaycastingScene()
     scene.add_triangles(mesh)