diff --git a/3rdparty/embree/embree.cmake b/3rdparty/embree/embree.cmake index 2495c987bbd..6d55198ec69 100644 --- a/3rdparty/embree/embree.cmake +++ b/3rdparty/embree/embree.cmake @@ -64,6 +64,16 @@ else() endif() +if(BUILD_SYCL_MODULE) + set(ISA_ARGS ${ISA_ARGS} -DCMAKE_CXX_COMPILER=icpx) + set(ISA_ARGS ${ISA_ARGS} -DCMAKE_C_COMPILER=icx) + set(ISA_ARGS ${ISA_ARGS} -DEMBREE_SYCL_SUPPORT=ON) + list(APPEND ISA_LIBS embree4_sycl ze_wrapper) + list(APPEND ISA_BUILD_BYPRODUCTS "/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}embree4_sycl${CMAKE_STATIC_LIBRARY_SUFFIX}" + "/${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_wrapper${CMAKE_STATIC_LIBRARY_SUFFIX}") +endif() + + ExternalProject_Add( ext_embree PREFIX embree @@ -92,10 +102,11 @@ ExternalProject_Add( /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}sys${CMAKE_STATIC_LIBRARY_SUFFIX} /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}math${CMAKE_STATIC_LIBRARY_SUFFIX} /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}tasking${CMAKE_STATIC_LIBRARY_SUFFIX} + /${Open3D_INSTALL_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}ze_wrapper${CMAKE_STATIC_LIBRARY_SUFFIX} ${ISA_BUILD_BYPRODUCTS} ) ExternalProject_Get_Property(ext_embree INSTALL_DIR) set(EMBREE_INCLUDE_DIRS ${INSTALL_DIR}/include/ ${INSTALL_DIR}/src/ext_embree/) # "/" is critical. set(EMBREE_LIB_DIR ${INSTALL_DIR}/${Open3D_INSTALL_LIB_DIR}) -set(EMBREE_LIBRARIES embree4 ${ISA_LIBS} simd lexers sys math tasking) +set(EMBREE_LIBRARIES embree4 simd lexers sys math tasking ${ISA_LIBS}) diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake index c3dc3885565..d2afbe405c9 100644 --- a/3rdparty/find_dependencies.cmake +++ b/3rdparty/find_dependencies.cmake @@ -1384,7 +1384,7 @@ if(BUILD_GUI) ${CPP_LIBRARY} ${CPPABI_LIBRARY}) message(STATUS "Filament C++ libraries: ${CPP_LIBRARY} ${CPPABI_LIBRARY}") if (LIBCPP_VERSION GREATER 11) - message(WARNING "libc++ (LLVM) version ${LIBCPP_VERSION} > 11 includes libunwind that " + message(WARNING "libc++ (LLVM) version ${LIBCPP_VERSION} > 11 includes libunwind that " "interferes with the system libunwind.so.8 and may crash Python code when exceptions " "are used. Please consider using libc++ (LLVM) v11.") endif() @@ -1697,7 +1697,7 @@ else(OPEN3D_USE_ONEAPI_PACKAGES) INCLUDE_DIRS ${STATIC_MKL_INCLUDE_DIR} LIB_DIR ${STATIC_MKL_LIB_DIR} LIBRARIES ${STATIC_MKL_LIBRARIES} - DEPENDS ext_tbb ext_mkl_include ext_mkl + DEPENDS Open3D::3rdparty_tbb ext_mkl_include ext_mkl ) if(UNIX) target_compile_options(3rdparty_blas INTERFACE "$<$:-m64>") @@ -1719,7 +1719,7 @@ else(OPEN3D_USE_ONEAPI_PACKAGES) endif() if(NOT USE_SYSTEM_TBB) include(${Open3D_3RDPARTY_DIR}/mkl/tbb.cmake) - list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM TBB::tbb) + list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM Open3D::3rdparty_tbb) else() list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb) endif() diff --git a/3rdparty/mkl/tbb.cmake b/3rdparty/mkl/tbb.cmake index 385af4b644b..a36435ee1b4 100644 --- a/3rdparty/mkl/tbb.cmake +++ b/3rdparty/mkl/tbb.cmake @@ -38,3 +38,4 @@ install(TARGETS tbb EXPORT ${PROJECT_NAME}Targets RUNTIME DESTINATION ${Open3D_INSTALL_BIN_DIR} COMPONENT tbb ) +add_library(${PROJECT_NAME}::3rdparty_tbb ALIAS tbb) diff --git a/cpp/open3d/core/Device.cpp b/cpp/open3d/core/Device.cpp index 7137e307874..2681fe8dc79 100644 --- a/cpp/open3d/core/Device.cpp +++ b/cpp/open3d/core/Device.cpp @@ -137,7 +137,7 @@ std::vector Device::GetAvailableCUDADevices() { } std::vector Device::GetAvailableSYCLDevices() { - return sycl::GetAvailableSYCLDevices(); + return sy::GetAvailableSYCLDevices(); } void Device::PrintAvailableDevices() { diff --git a/cpp/open3d/core/Device.h b/cpp/open3d/core/Device.h index 5b04875ed20..13d6715f025 100644 --- a/cpp/open3d/core/Device.h +++ b/cpp/open3d/core/Device.h @@ -21,7 +21,7 @@ class Device { enum class DeviceType { CPU = 0, CUDA = 1, - SYCL = 2, // SYCL gpu_selector(). + SYCL = 2, // SYCL gpu_selector_v. }; /// Default constructor -> "CPU:0". diff --git a/cpp/open3d/core/MemoryManagerSYCL.cpp b/cpp/open3d/core/MemoryManagerSYCL.cpp index c3d642a2c76..cfa92ffc804 100644 --- a/cpp/open3d/core/MemoryManagerSYCL.cpp +++ b/cpp/open3d/core/MemoryManagerSYCL.cpp @@ -5,8 +5,8 @@ // SPDX-License-Identifier: MIT // ---------------------------------------------------------------------------- -#include #include +#include #include #include "open3d/core/Device.h" @@ -18,24 +18,22 @@ namespace open3d { namespace core { -namespace sy = cl::sycl; - void* MemoryManagerSYCL::Malloc(size_t byte_size, const Device& device) { - const sy::queue& queue = - sycl::SYCLContext::GetInstance().GetDefaultQueue(device); + const sycl::queue& queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); #ifdef ENABLE_SYCL_UNIFIED_SHARED_MEMORY - return static_cast(sy::malloc_shared(byte_size, queue)); + return static_cast(sycl::malloc_shared(byte_size, queue)); #else - return static_cast(sy::malloc_device(byte_size, queue)); + return static_cast(sycl::malloc_device(byte_size, queue)); #endif } void MemoryManagerSYCL::Free(void* ptr, const Device& device) { if (ptr) { - const sy::queue& queue = - sycl::SYCLContext::GetInstance().GetDefaultQueue(device); - sy::free(ptr, queue); + const sycl::queue& queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device); + sycl::free(ptr, queue); } } @@ -62,8 +60,8 @@ void MemoryManagerSYCL::Memcpy(void* dst_ptr, dst_device.ToString()); } - sy::queue queue = - sycl::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); + sycl::queue queue = + sy::SYCLContext::GetInstance().GetDefaultQueue(device_with_queue); queue.memcpy(dst_ptr, src_ptr, num_bytes).wait_and_throw(); } diff --git a/cpp/open3d/core/SYCLContext.cpp b/cpp/open3d/core/SYCLContext.cpp index 660ab8d2bab..6a3007d7dba 100644 --- a/cpp/open3d/core/SYCLContext.cpp +++ b/cpp/open3d/core/SYCLContext.cpp @@ -7,17 +7,17 @@ #include "open3d/core/SYCLContext.h" -#include #include #include #include +#include #include "open3d/core/SYCLUtils.h" #include "open3d/utility/Logging.h" namespace open3d { namespace core { -namespace sycl { +namespace sy { SYCLContext &SYCLContext::GetInstance() { static thread_local SYCLContext instance; @@ -38,7 +38,7 @@ bool SYCLContext::IsDeviceAvailable(const Device &device) { } std::vector SYCLContext::GetAvailableSYCLDevices() { return devices_; } -sy::queue SYCLContext::GetDefaultQueue(const Device &device) { +sycl::queue SYCLContext::GetDefaultQueue(const Device &device) { return device_to_default_queue_.at(device); } @@ -46,12 +46,12 @@ SYCLContext::SYCLContext() { // SYCL GPU. // TODO: Currently we only support one GPU device. try { - const sy::device &sycl_device = sy::device(sy::gpu_selector()); + const sycl::device &sycl_device = sycl::device(sycl::gpu_selector_v); const Device open3d_device = Device("SYCL:0"); devices_.push_back(open3d_device); device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sy::queue(sycl_device); - } catch (const sy::exception &e) { + device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + } catch (const sycl::exception &e) { } if (devices_.size() == 0) { @@ -59,15 +59,16 @@ SYCLContext::SYCLContext() { // This could happen if the Intel GPGPU driver is not installed or if // your CPU does not have integrated GPU. try { - const sy::device &sycl_device = sy::device(sy::host_selector()); + const sycl::device &sycl_device = + sycl::device(sycl::cpu_selector_v); const Device open3d_device = Device("SYCL:0"); utility::LogWarning( "SYCL GPU device is not available, falling back to SYCL " "host device."); devices_.push_back(open3d_device); device_to_sycl_device_[open3d_device] = sycl_device; - device_to_default_queue_[open3d_device] = sy::queue(sycl_device); - } catch (const sy::exception &e) { + device_to_default_queue_[open3d_device] = sycl::queue(sycl_device); + } catch (const sycl::exception &e) { } } @@ -76,6 +77,6 @@ SYCLContext::SYCLContext() { } } -} // namespace sycl +} // namespace sy } // namespace core } // namespace open3d diff --git a/cpp/open3d/core/SYCLContext.h b/cpp/open3d/core/SYCLContext.h index e23b9d8770c..b44e2e3fb96 100644 --- a/cpp/open3d/core/SYCLContext.h +++ b/cpp/open3d/core/SYCLContext.h @@ -14,16 +14,14 @@ #pragma once -#include +#include #include #include "open3d/core/Device.h" namespace open3d { namespace core { -namespace sycl { - -namespace sy = cl::sycl; +namespace sy { /// Singleton SYCL context manager. It maintains: /// - A default queue for each SYCL device @@ -45,7 +43,7 @@ class SYCLContext { std::vector GetAvailableSYCLDevices(); /// Get the default SYCL queue given an Open3D device. - sy::queue GetDefaultQueue(const Device& device); + sycl::queue GetDefaultQueue(const Device& device); private: SYCLContext(); @@ -54,12 +52,12 @@ class SYCLContext { std::vector devices_; /// Maps core::Device to the corresponding default SYCL queue. - std::unordered_map device_to_default_queue_; + std::unordered_map device_to_default_queue_; - /// Maps core::Device to sy::device. Internal use only for now. - std::unordered_map device_to_sycl_device_; + /// Maps core::Device to sycl::device. Internal use only for now. + std::unordered_map device_to_sycl_device_; }; -} // namespace sycl +} // namespace sy } // namespace core } // namespace open3d diff --git a/cpp/open3d/core/SYCLUtils.cpp b/cpp/open3d/core/SYCLUtils.cpp index d9e42fdcde0..5860021996d 100644 --- a/cpp/open3d/core/SYCLUtils.cpp +++ b/cpp/open3d/core/SYCLUtils.cpp @@ -22,45 +22,42 @@ #include "open3d/utility/Logging.h" #ifdef BUILD_SYCL_MODULE -#include +#include #include "open3d/core/SYCLContext.h" #endif namespace open3d { namespace core { -namespace sycl { - -#ifdef BUILD_SYCL_MODULE -namespace sy = cl::sycl; -#endif +namespace sy { int SYCLDemo() { #ifdef BUILD_SYCL_MODULE // Ref: https://intel.github.io/llvm-docs/GetStartedGuide.html // Creating buffer of 4 ints to be used inside the kernel code. - sy::buffer buffer(4); + sycl::buffer buffer(4); // Creating SYCL queue. - sy::queue q; + sycl::queue q; // Size of index space for kernel. - sy::range<1> num_workloads{buffer.size()}; + sycl::range<1> num_workloads{buffer.size()}; // Submitting command group(work) to q. - q.submit([&](sy::handler &cgh) { + q.submit([&](sycl::handler &cgh) { // Getting write only access to the buffer on a device. - auto accessor = buffer.get_access(cgh); + auto accessor = buffer.get_access(cgh); // Execute kernel. - cgh.parallel_for(num_workloads, [=](sy::id<1> WIid) { - // Fill buffer with indexes. - accessor[WIid] = (sy::cl_int)WIid.get(0); - }); + cgh.parallel_for( + num_workloads, [=](sycl::id<1> WIid) { + // Fill buffer with indexes. + accessor[WIid] = (int)WIid.get(0); + }); }); - // Getting read only access to the buffer on the host. + // Getting access to the buffer on the host. // Implicit barrier waiting for q to complete the work. - const auto host_accessor = buffer.get_access(); + const auto host_accessor = buffer.get_host_access(); // Check the results. bool mismatch_found = false; @@ -87,34 +84,34 @@ int SYCLDemo() { #ifdef BUILD_SYCL_MODULE -static std::string GetDeviceTypeName(const sy::device &device) { - auto device_type = device.get_info(); +static std::string GetDeviceTypeName(const sycl::device &device) { + auto device_type = device.get_info(); switch (device_type) { - case sy::info::device_type::cpu: + case sycl::info::device_type::cpu: return "cpu"; - case sy::info::device_type::gpu: + case sycl::info::device_type::gpu: return "gpu"; - case sy::info::device_type::host: + case sycl::info::device_type::host: return "host"; - case sy::info::device_type::accelerator: + case sycl::info::device_type::accelerator: return "acc"; default: return "unknown"; } } -static std::string GetBackendName(const sy::device &device) { - sy::platform platform = device.get_info(); - sy::backend backend = platform.get_backend(); +static std::string GetBackendName(const sycl::device &device) { + sycl::platform platform = device.get_info(); + sycl::backend backend = platform.get_backend(); std::ostringstream os; os << backend; return os.str(); } -static std::string SYCLDeviceToString(const sy::device &device) { +static std::string SYCLDeviceToString(const sycl::device &device) { std::ostringstream os; os << "[" << GetBackendName(device) << ":" << GetDeviceTypeName(device) - << "] " << device.get_info(); + << "] " << device.get_info(); return os.str(); } #endif @@ -131,68 +128,62 @@ void PrintSYCLDevices(bool print_all) { if (print_all) { utility::LogInfo("# All SYCL devices"); - const std::vector &platforms = - sy::platform::get_platforms(); - for (const sy::platform &platform : platforms) { - sy::backend backend = platform.get_backend(); - const std::vector &devices = platform.get_devices(); - for (const sy::device &device : devices) { + const std::vector &platforms = + sycl::platform::get_platforms(); + for (const sycl::platform &platform : platforms) { + sycl::backend backend = platform.get_backend(); + const std::vector &devices = platform.get_devices(); + for (const sycl::device &device : devices) { utility::LogInfo("- {}", SYCLDeviceToString(device)); } } utility::LogInfo("# Default SYCL selectors"); try { - const sy::device &device = sy::device(sy::default_selector()); - utility::LogInfo("- sycl::default_selector() : {}", + const sycl::device &device = sycl::device(sycl::default_selector_v); + utility::LogInfo("- sycl::default_selector_v : {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { - utility::LogInfo("- sycl::default_selector() : N/A"); + } catch (const sycl::exception &e) { + utility::LogInfo("- sycl::default_selector_v : N/A"); } try { - const sy::device &device = sy::device(sy::host_selector()); - utility::LogInfo("- sycl::host_selector() : {}", + const sycl::device &device = sycl::device(sycl::cpu_selector_v); + utility::LogInfo("- sycl::cpu_selector_v : {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { - utility::LogInfo("- sycl::host_selector() : N/A"); + } catch (const sycl::exception &e) { + utility::LogInfo("- sycl::cpu_selector_v : N/A"); } try { - const sy::device &device = sy::device(sy::cpu_selector()); - utility::LogInfo("- sycl::cpu_selector() : {}", + const sycl::device &device = sycl::device(sycl::gpu_selector_v); + utility::LogInfo("- sycl::gpu_selector_v : {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { - utility::LogInfo("- sycl::cpu_selector() : N/A"); + } catch (const sycl::exception &e) { + utility::LogInfo("- sycl::gpu_selector_v : N/A"); } try { - const sy::device &device = sy::device(sy::gpu_selector()); - utility::LogInfo("- sycl::gpu_selector() : {}", + const sycl::device &device = + sycl::device(sycl::accelerator_selector_v); + utility::LogInfo("- sycl::accelerator_selector_v: {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { - utility::LogInfo("- sycl::gpu_selector() : N/A"); - } - try { - const sy::device &device = sy::device(sy::accelerator_selector()); - utility::LogInfo("- sycl::accelerator_selector(): {}", - SYCLDeviceToString(device)); - } catch (const sy::exception &e) { - utility::LogInfo("- sycl::accelerator_selector(): N/A"); + } catch (const sycl::exception &e) { + utility::LogInfo("- sycl::accelerator_selector_v: N/A"); } utility::LogInfo("# Open3D SYCL device"); try { - const sy::device &device = sy::device(sy::gpu_selector()); + const sycl::device &device = sycl::device(sycl::gpu_selector_v); utility::LogInfo("- Device(\"SYCL:0\"): {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { + } catch (const sycl::exception &e) { utility::LogInfo("- Device(\"SYCL:0\"): N/A"); } } else { utility::LogInfo("# Open3D SYCL device"); try { - const sy::device &device = sy::device(sy::gpu_selector()); + const sycl::device &device = sycl::device(sycl::gpu_selector_v); utility::LogInfo("- Device(\"SYCL:0\"): {}", SYCLDeviceToString(device)); - } catch (const sy::exception &e) { + } catch (const sycl::exception &e) { utility::LogInfo("- Device(\"SYCL:0\"): N/A"); } } @@ -227,6 +218,19 @@ std::vector GetAvailableSYCLDevices() { #endif } -} // namespace sycl +void enablePersistentJITCache() { +#ifdef BUILD_SYCL_MODULE +#if defined(_WIN32) + _putenv_s("SYCL_CACHE_PERSISTENT", "1"); +#else + setenv("SYCL_CACHE_PERSISTENT", "1", 1); +#endif +#endif + utility::LogInfo( + "enablePersistentJITCache is not compiled with " + "BUILD_SYCL_MODULE=ON."); +} + +} // namespace sy } // namespace core } // namespace open3d diff --git a/cpp/open3d/core/SYCLUtils.h b/cpp/open3d/core/SYCLUtils.h index 385523f3800..8a38ad662d0 100644 --- a/cpp/open3d/core/SYCLUtils.h +++ b/cpp/open3d/core/SYCLUtils.h @@ -19,7 +19,7 @@ namespace open3d { namespace core { -namespace sycl { +namespace sy { /// Runs simple SYCL test program for sanity checks. /// \return Retuns 0 if successful. @@ -40,6 +40,10 @@ bool IsDeviceAvailable(const Device& device); /// Return a list of available SYCL devices. std::vector GetAvailableSYCLDevices(); -} // namespace sycl +/// Enables the JIT cache for SYCL. This sets an environment variable and will +/// affect the entire process and any child processes. +void enablePersistentJITCache(); + +} // namespace sy } // namespace core } // namespace open3d diff --git a/cpp/open3d/t/geometry/CMakeLists.txt b/cpp/open3d/t/geometry/CMakeLists.txt index 1b1ba5760cf..e345c819b6f 100644 --- a/cpp/open3d/t/geometry/CMakeLists.txt +++ b/cpp/open3d/t/geometry/CMakeLists.txt @@ -7,7 +7,6 @@ target_sources(tgeometry PRIVATE LineSet.cpp BoundingVolume.cpp PointCloud.cpp - RaycastingScene.cpp RGBDImage.cpp TensorMap.cpp TriangleMesh.cpp @@ -16,6 +15,16 @@ target_sources(tgeometry PRIVATE VtkUtils.cpp ) +if (BUILD_SYCL_MODULE) + open3d_sycl_target_sources(tgeometry PRIVATE + RaycastingScene.cpp + ) +else() + target_sources(tgeometry PRIVATE + RaycastingScene.cpp + ) +endif() + open3d_show_and_abort_on_warning(tgeometry) open3d_set_global_properties(tgeometry) open3d_set_open3d_lib_properties(tgeometry) diff --git a/cpp/open3d/t/geometry/RaycastingScene.cpp b/cpp/open3d/t/geometry/RaycastingScene.cpp index 12f082d0ee1..444f4f7a5ff 100644 --- a/cpp/open3d/t/geometry/RaycastingScene.cpp +++ b/cpp/open3d/t/geometry/RaycastingScene.cpp @@ -11,11 +11,15 @@ #endif #include "open3d/t/geometry/RaycastingScene.h" +#ifdef BUILD_SYCL_MODULE +#include +#endif // This header is in the embree src dir (embree/src/ext_embree/..). #include #include #include +#include #include #include #include @@ -24,51 +28,26 @@ #include "open3d/utility/Helper.h" #include "open3d/utility/Logging.h" -namespace { - -typedef Eigen::AlignedVector3 Vec3fa; -// Dont force alignment for Vec2f because we use it just for storing -typedef Eigen::Matrix Vec2f; -typedef Eigen::Vector3f Vec3f; - -// Error function called by embree. -void ErrorFunction(void* userPtr, enum RTCError error, const char* str) { - open3d::utility::LogError("Embree error: {} {}", rtcGetErrorString(error), - str); -} +namespace callbacks { -// Checks the last dim, ensures that the number of dims is >= min_ndim, checks -// the device, and dtype. -template -void AssertTensorDtypeLastDimDeviceMinNDim(const open3d::core::Tensor& tensor, - const std::string& tensor_name, - int64_t last_dim, - const open3d::core::Device& device, - int64_t min_ndim = 2) { - open3d::core::AssertTensorDevice(tensor, device); - if (tensor.NumDims() < min_ndim) { - open3d::utility::LogError( - "{} Tensor ndim is {} but expected ndim >= {}", tensor_name, - tensor.NumDims(), min_ndim); - } - if (tensor.GetShape().back() != last_dim) { - open3d::utility::LogError( - "The last dimension of the {} Tensor must be {} but got " - "Tensor with shape {}", - tensor_name, last_dim, tensor.GetShape().ToString()); - } - open3d::core::AssertTensorDtype(tensor, - open3d::core::Dtype::FromType()); -} +struct GeomPrimID { + uint32_t geomID; + uint32_t primID; + float ray_tfar; +}; struct CountIntersectionsContext { RTCRayQueryContext context; - std::vector>* - previous_geom_prim_ID_tfar; + GeomPrimID* previous_geom_prim_ID_tfar; int* intersections; }; +#ifdef BUILD_SYCL_MODULE +RTC_SYCL_INDIRECTLY_CALLABLE void CountIntersectionsFunc( + const RTCFilterFunctionNArguments* args) { +#else void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) { +#endif int* valid = args->valid; const CountIntersectionsContext* context = reinterpret_cast(args->context); @@ -79,8 +58,8 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) { // Avoid crashing when debug visualizations are used. if (context == nullptr) return; - std::vector>* - previous_geom_prim_ID_tfar = context->previous_geom_prim_ID_tfar; + GeomPrimID* previous_geom_prim_ID_tfar = + context->previous_geom_prim_ID_tfar; int* intersections = context->intersections; // Iterate over all rays in ray packet. @@ -97,14 +76,13 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) { RTCHit hit = rtcGetHitFromHitN(hitN, N, ui); unsigned int ray_id = ray.id; - std::tuple gpID(hit.geomID, hit.primID, - ray.tfar); - auto& prev_gpIDtfar = previous_geom_prim_ID_tfar->operator[](ray_id); - if (std::get<0>(prev_gpIDtfar) != hit.geomID || - (std::get<1>(prev_gpIDtfar) != hit.primID && - std::get<2>(prev_gpIDtfar) != ray.tfar)) { + GeomPrimID gpID = {hit.geomID, hit.primID, ray.tfar}; + auto& prev_gpIDtfar = previous_geom_prim_ID_tfar[ray_id]; + if (prev_gpIDtfar.geomID != hit.geomID || + (prev_gpIDtfar.primID != hit.primID && + prev_gpIDtfar.ray_tfar != ray.tfar)) { ++(intersections[ray_id]); - previous_geom_prim_ID_tfar->operator[](ray_id) = gpID; + previous_geom_prim_ID_tfar[ray_id] = gpID; } // Always ignore hit valid[ui] = 0; @@ -113,18 +91,22 @@ void CountIntersectionsFunc(const RTCFilterFunctionNArguments* args) { struct ListIntersectionsContext { RTCRayQueryContext context; - std::vector>* - previous_geom_prim_ID_tfar; + GeomPrimID* previous_geom_prim_ID_tfar; unsigned int* ray_ids; unsigned int* geometry_ids; unsigned int* primitive_ids; float* primitive_uvs; float* t_hit; - Eigen::VectorXi cumsum; + int* cumsum; unsigned int* track_intersections; }; +#ifdef BUILD_SYCL_MODULE +RTC_SYCL_INDIRECTLY_CALLABLE void ListIntersectionsFunc( + const RTCFilterFunctionNArguments* args) { +#else void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) { +#endif int* valid = args->valid; const ListIntersectionsContext* context = reinterpret_cast(args->context); @@ -135,14 +117,14 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) { // Avoid crashing when debug visualizations are used. if (context == nullptr) return; - std::vector>* - previous_geom_prim_ID_tfar = context->previous_geom_prim_ID_tfar; + GeomPrimID* previous_geom_prim_ID_tfar = + context->previous_geom_prim_ID_tfar; unsigned int* ray_ids = context->ray_ids; unsigned int* geometry_ids = context->geometry_ids; unsigned int* primitive_ids = context->primitive_ids; float* primitive_uvs = context->primitive_uvs; float* t_hit = context->t_hit; - Eigen::VectorXi cumsum = context->cumsum; + int* cumsum = context->cumsum; unsigned int* track_intersections = context->track_intersections; // Iterate over all rays in ray packet. @@ -159,12 +141,11 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) { RTCHit hit = rtcGetHitFromHitN(hitN, N, ui); unsigned int ray_id = ray.id; - std::tuple gpID(hit.geomID, hit.primID, - ray.tfar); - auto& prev_gpIDtfar = previous_geom_prim_ID_tfar->operator[](ray_id); - if (std::get<0>(prev_gpIDtfar) != hit.geomID || - (std::get<1>(prev_gpIDtfar) != hit.primID && - std::get<2>(prev_gpIDtfar) != ray.tfar)) { + GeomPrimID gpID = {hit.geomID, hit.primID, ray.tfar}; + auto& prev_gpIDtfar = previous_geom_prim_ID_tfar[ray_id]; + if (prev_gpIDtfar.geomID != hit.geomID || + (prev_gpIDtfar.primID != hit.primID && + prev_gpIDtfar.ray_tfar != ray.tfar)) { size_t idx = cumsum[ray_id] + track_intersections[ray_id]; ray_ids[idx] = ray_id; geometry_ids[idx] = hit.geomID; @@ -172,7 +153,7 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) { primitive_uvs[idx * 2 + 0] = hit.u; primitive_uvs[idx * 2 + 1] = hit.v; t_hit[idx] = ray.tfar; - previous_geom_prim_ID_tfar->operator[](ray_id) = gpID; + previous_geom_prim_ID_tfar[ray_id] = gpID; ++(track_intersections[ray_id]); } // Always ignore hit @@ -180,16 +161,56 @@ void ListIntersectionsFunc(const RTCFilterFunctionNArguments* args) { } } +} // namespace callbacks + +namespace { + +typedef Eigen::AlignedVector3 Vec3fa; +// Dont force alignment for Vec2f because we use it just for storing +typedef Eigen::Matrix Vec2f; +typedef Eigen::Vector3f Vec3f; + +// Error function called by embree. +void ErrorFunction(void* userPtr, enum RTCError error, const char* str) { + open3d::utility::LogError("Embree error: {} {}", rtcGetErrorString(error), + str); +} + +// Checks the last dim, ensures that the number of dims is >= min_ndim, checks +// the device, and dtype. +template +void AssertTensorDtypeLastDimDeviceMinNDim(const open3d::core::Tensor& tensor, + const std::string& tensor_name, + int64_t last_dim, + const open3d::core::Device& device, + int64_t min_ndim = 2) { + open3d::core::AssertTensorDevice(tensor, device); + if (tensor.NumDims() < min_ndim) { + open3d::utility::LogError( + "{} Tensor ndim is {} but expected ndim >= {}", tensor_name, + tensor.NumDims(), min_ndim); + } + if (tensor.GetShape().back() != last_dim) { + open3d::utility::LogError( + "The last dimension of the {} Tensor must be {} but got " + "Tensor with shape {}", + tensor_name, last_dim, tensor.GetShape().ToString()); + } + open3d::core::AssertTensorDtype(tensor, + open3d::core::Dtype::FromType()); +} + // Adapted from common/math/closest_point.h -inline Vec3fa closestPointTriangle(Vec3fa const& p, - Vec3fa const& a, - Vec3fa const& b, - Vec3fa const& c, - float& tex_u, - float& tex_v) { - const Vec3fa ab = b - a; - const Vec3fa ac = c - a; - const Vec3fa ap = p - a; +template +inline Vec3faType closestPointTriangle(Vec3faType const& p, + Vec3faType const& a, + Vec3faType const& b, + Vec3faType const& c, + float& tex_u, + float& tex_v) { + const Vec3faType ab = b - a; + const Vec3faType ac = c - a; + const Vec3faType ap = p - a; const float d1 = ab.dot(ap); const float d2 = ac.dot(ap); @@ -199,7 +220,7 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p, return a; } - const Vec3fa bp = p - b; + const Vec3faType bp = p - b; const float d3 = ab.dot(bp); const float d4 = ac.dot(bp); if (d3 >= 0.f && d4 <= d3) { @@ -208,7 +229,7 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p, return b; } - const Vec3fa cp = p - c; + const Vec3faType cp = p - c; const float d5 = ab.dot(cp); const float d6 = ac.dot(cp); if (d6 >= 0.f && d5 <= d6) { @@ -249,56 +270,64 @@ inline Vec3fa closestPointTriangle(Vec3fa const& p, return a + v * ab + w * ac; } +struct GeometryPtr { + RTCGeometryType geom_type; + const void* ptr1; + const void* ptr2; +}; + +template struct ClosestPointResult { ClosestPointResult() - : primID(RTC_INVALID_GEOMETRY_ID), - geomID(RTC_INVALID_GEOMETRY_ID), - geometry_ptrs_ptr() {} + : primID(RTC_INVALID_GEOMETRY_ID), geomID(RTC_INVALID_GEOMETRY_ID) {} - Vec3f p; + Vec3fType p; unsigned int primID; unsigned int geomID; - Vec2f uv; - Vec3f n; - std::vector>* - geometry_ptrs_ptr; + Vec2fType uv; + Vec3fType n; + GeometryPtr* geometry_ptrs_ptr; }; // Code adapted from the embree closest_point tutorial. +template bool ClosestPointFunc(RTCPointQueryFunctionArguments* args) { assert(args->userPtr); const unsigned int geomID = args->geomID; const unsigned int primID = args->primID; // query position in world space - Vec3fa q(args->query->x, args->query->y, args->query->z); + Vec3faType q(args->query->x, args->query->y, args->query->z); - ClosestPointResult* result = - static_cast(args->userPtr); + ClosestPointResult* result = + static_cast*>( + args->userPtr); const RTCGeometryType geom_type = - std::get<0>(result->geometry_ptrs_ptr->operator[](geomID)); - const void* ptr1 = - std::get<1>(result->geometry_ptrs_ptr->operator[](geomID)); - const void* ptr2 = - std::get<2>(result->geometry_ptrs_ptr->operator[](geomID)); + result->geometry_ptrs_ptr[geomID].geom_type; + const void* ptr1 = result->geometry_ptrs_ptr[geomID].ptr1; + const void* ptr2 = result->geometry_ptrs_ptr[geomID].ptr2; if (RTC_GEOMETRY_TYPE_TRIANGLE == geom_type) { const float* vertex_positions = (const float*)ptr1; const uint32_t* triangle_indices = (const uint32_t*)ptr2; - Vec3fa v0(vertex_positions[3 * triangle_indices[3 * primID + 0] + 0], - vertex_positions[3 * triangle_indices[3 * primID + 0] + 1], - vertex_positions[3 * triangle_indices[3 * primID + 0] + 2]); - Vec3fa v1(vertex_positions[3 * triangle_indices[3 * primID + 1] + 0], - vertex_positions[3 * triangle_indices[3 * primID + 1] + 1], - vertex_positions[3 * triangle_indices[3 * primID + 1] + 2]); - Vec3fa v2(vertex_positions[3 * triangle_indices[3 * primID + 2] + 0], - vertex_positions[3 * triangle_indices[3 * primID + 2] + 1], - vertex_positions[3 * triangle_indices[3 * primID + 2] + 2]); + Vec3faType v0( + vertex_positions[3 * triangle_indices[3 * primID + 0] + 0], + vertex_positions[3 * triangle_indices[3 * primID + 0] + 1], + vertex_positions[3 * triangle_indices[3 * primID + 0] + 2]); + Vec3faType v1( + vertex_positions[3 * triangle_indices[3 * primID + 1] + 0], + vertex_positions[3 * triangle_indices[3 * primID + 1] + 1], + vertex_positions[3 * triangle_indices[3 * primID + 1] + 2]); + Vec3faType v2( + vertex_positions[3 * triangle_indices[3 * primID + 2] + 0], + vertex_positions[3 * triangle_indices[3 * primID + 2] + 1], + vertex_positions[3 * triangle_indices[3 * primID + 2] + 2]); // Determine distance to closest point on triangle float u, v; - const Vec3fa p = closestPointTriangle(q, v0, v1, v2, u, v); + const Vec3faType p = + closestPointTriangle(q, v0, v1, v2, u, v); float d = (q - p).norm(); // Store result in userPtr and update the query radius if we found a @@ -309,9 +338,9 @@ bool ClosestPointFunc(RTCPointQueryFunctionArguments* args) { result->p = p; result->primID = primID; result->geomID = geomID; - Vec3fa e1 = v1 - v0; - Vec3fa e2 = v2 - v0; - result->uv = Vec2f(u, v); + Vec3faType e1 = v1 - v0; + Vec3faType e2 = v2 - v0; + result->uv = Vec2fType(u, v); result->n = (e1.cross(e2)).normalized(); return true; // Return true to indicate that the query radius // changed. @@ -327,18 +356,17 @@ namespace t { namespace geometry { struct RaycastingScene::Impl { - // The maximum number of rays used in calls to embree. - const size_t BATCH_SIZE = 1024; - RTCDevice device_; RTCScene scene_; bool scene_committed_; // true if the scene has been committed. + RTCDevice device_; // Vector for storing some information about the added geometry. - std::vector> - geometry_ptrs_; - core::Device tensor_device_; // cpu + std::vector geometry_ptrs_; + core::Device tensor_device_; // cpu or sycl bool devprop_join_commit; + virtual ~Impl() = default; + void CommitScene() { if (!scene_committed_) { if (devprop_join_commit) { @@ -350,7 +378,100 @@ struct RaycastingScene::Impl { } } - template + virtual void CastRays(const float* const rays, + const size_t num_rays, + float* t_hit, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* primitive_normals, + const int nthreads, + const bool line_intersection) = 0; + + virtual void TestOcclusions(const float* const rays, + const size_t num_rays, + const float tnear, + const float tfar, + int8_t* occluded, + const int nthreads) = 0; + + virtual void CountIntersections(const float* const rays, + const size_t num_rays, + int* intersections, + const int nthreads) = 0; + + virtual void ListIntersections(const float* const rays, + const size_t num_rays, + const size_t num_intersections, + int* cumsum, + unsigned int* track_intersections, + unsigned int* ray_ids, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* t_hit, + const int nthreads) = 0; + + virtual void ComputeClosestPoints(const float* const query_points, + const size_t num_query_points, + float* closest_points, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* primitive_normals, + const int nthreads) = 0; + + virtual void ArraySum(int* data_ptr, + size_t num_elements, + size_t& result) = 0; + + virtual void ArrayPartialSum(int* input, + int* output, + size_t num_elements) = 0; + + virtual void CopyArray(int* src, uint32_t* dst, size_t num_elements) = 0; +}; + +#ifdef BUILD_SYCL_MODULE +struct RaycastingScene::SYCLImpl : public RaycastingScene::Impl { + // SYCL variables + sycl::queue queue_; + sycl::context context_; + sycl::device sycl_device_; + + callbacks::GeomPrimID* li_previous_geom_prim_ID_tfar = nullptr; + callbacks::GeomPrimID* ci_previous_geom_prim_ID_tfar = nullptr; + + ~SYCLImpl() { + if (li_previous_geom_prim_ID_tfar) { + sycl::free(li_previous_geom_prim_ID_tfar, queue_); + } + if (ci_previous_geom_prim_ID_tfar) { + sycl::free(ci_previous_geom_prim_ID_tfar, queue_); + } + } + + void InitializeDevice() { + try { + sycl_device_ = sycl::device(rtcSYCLDeviceSelector); + } catch (std::exception& e) { + utility::LogError("Caught exception creating sycl::device: {}", + e.what()); + return; + } + + queue_ = sycl::queue(sycl_device_); + context_ = sycl::context(sycl_device_); + + device_ = rtcNewSYCLDevice(context_, ""); + rtcSetDeviceSYCLDevice(device_, sycl_device_); + + if (!device_) { + utility::LogError("Error %d: cannot create device\n", + rtcGetDeviceError(NULL)); + } + } + void CastRays(const float* const rays, const size_t num_rays, float* t_hit, @@ -358,7 +479,361 @@ struct RaycastingScene::Impl { unsigned int* primitive_ids, float* primitive_uvs, float* primitive_normals, - const int nthreads) { + const int nthreads, + const bool line_intersection) override { + CommitScene(); + + auto scene = this->scene_; + queue_.submit([=](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>(num_rays), + [=](sycl::item<1> item, sycl::kernel_handler kh) { + const size_t i = item.get_id(0); + + struct RTCRayHit rh; + const float* r = &rays[i * 6]; + rh.ray.org_x = r[0]; + rh.ray.org_y = r[1]; + rh.ray.org_z = r[2]; + if (line_intersection) { + rh.ray.dir_x = r[3] - r[0]; + rh.ray.dir_y = r[4] - r[1]; + rh.ray.dir_z = r[5] - r[2]; + } else { + rh.ray.dir_x = r[3]; + rh.ray.dir_y = r[4]; + rh.ray.dir_z = r[5]; + } + rh.ray.tnear = 0; + if (line_intersection) { + rh.ray.tfar = 1.f; + } else { + rh.ray.tfar = + std::numeric_limits::infinity(); + } + rh.ray.mask = -1; + rh.ray.id = i; + rh.ray.flags = 0; + rh.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; + + rtcIntersect1(scene, &rh); + + t_hit[i] = rh.ray.tfar; + if (rh.hit.geomID != RTC_INVALID_GEOMETRY_ID) { + geometry_ids[i] = rh.hit.geomID; + primitive_ids[i] = rh.hit.primID; + primitive_uvs[i * 2 + 0] = rh.hit.u; + primitive_uvs[i * 2 + 1] = rh.hit.v; + float inv_norm = + 1.f / std::sqrt(rh.hit.Ng_x * rh.hit.Ng_x + + rh.hit.Ng_y * rh.hit.Ng_y + + rh.hit.Ng_z * rh.hit.Ng_z); + primitive_normals[i * 3 + 0] = + rh.hit.Ng_x * inv_norm; + primitive_normals[i * 3 + 1] = + rh.hit.Ng_y * inv_norm; + primitive_normals[i * 3 + 2] = + rh.hit.Ng_z * inv_norm; + } else { + geometry_ids[i] = RTC_INVALID_GEOMETRY_ID; + primitive_ids[i] = RTC_INVALID_GEOMETRY_ID; + primitive_uvs[i * 2 + 0] = 0; + primitive_uvs[i * 2 + 1] = 0; + primitive_normals[i * 3 + 0] = 0; + primitive_normals[i * 3 + 1] = 0; + primitive_normals[i * 3 + 2] = 0; + } + }); + }); + queue_.wait_and_throw(); + } + + void TestOcclusions(const float* const rays, + const size_t num_rays, + const float tnear, + const float tfar, + int8_t* occluded, + const int nthreads) override { + CommitScene(); + + auto scene = this->scene_; + queue_.submit([=](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>(num_rays), + [=](sycl::item<1> item, sycl::kernel_handler kh) { + struct RTCRayQueryContext context; + rtcInitRayQueryContext(&context); + + RTCOccludedArguments args; + rtcInitOccludedArguments(&args); + args.context = &context; + + const size_t i = item.get_id(0); + + struct RTCRay ray; + const float* r = &rays[i * 6]; + ray.org_x = r[0]; + ray.org_y = r[1]; + ray.org_z = r[2]; + ray.dir_x = r[3]; + ray.dir_y = r[4]; + ray.dir_z = r[5]; + ray.tnear = tnear; + ray.tfar = tfar; + ray.mask = -1; + ray.id = i; + ray.flags = 0; + + rtcOccluded1(scene, &ray, &args); + + occluded[i] = int8_t( + -std::numeric_limits::infinity() == + ray.tfar); + }); + }); + queue_.wait_and_throw(); + } + + void CountIntersections(const float* const rays, + const size_t num_rays, + int* intersections, + const int nthreads) override { + CommitScene(); + + queue_.memset(intersections, 0, sizeof(int) * num_rays).wait(); + + ci_previous_geom_prim_ID_tfar = + sycl::malloc_device(num_rays, queue_); + + // Check if allocation was successful + if (!ci_previous_geom_prim_ID_tfar) { + throw std::runtime_error("Failed to allocate device memory"); + } + + auto host_previous_geom_prim_ID_tfar = + std::unique_ptr>( + new callbacks::GeomPrimID[num_rays]); + for (size_t i = 0; i < num_rays; ++i) { + host_previous_geom_prim_ID_tfar[i] = { + uint32_t(RTC_INVALID_GEOMETRY_ID), + uint32_t(RTC_INVALID_GEOMETRY_ID), 0.f}; + } + + // Copy the initialized data to the device + queue_.memcpy(ci_previous_geom_prim_ID_tfar, + host_previous_geom_prim_ID_tfar.get(), + num_rays * sizeof(callbacks::GeomPrimID)) + .wait(); + + auto scene = this->scene_; + auto ci_previous_geom_prim_ID_tfar_ = ci_previous_geom_prim_ID_tfar; + queue_.submit([=](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>(num_rays), + [=](sycl::item<1> item, sycl::kernel_handler kh) { + callbacks::CountIntersectionsContext context; + rtcInitRayQueryContext(&context.context); + context.previous_geom_prim_ID_tfar = + ci_previous_geom_prim_ID_tfar_; + context.intersections = intersections; + + RTCIntersectArguments args; + rtcInitIntersectArguments(&args); + args.filter = callbacks::CountIntersectionsFunc; + args.context = &context.context; + + const size_t i = item.get_id(0); + + struct RTCRayHit rh; + const float* r = &rays[i * 6]; + rh.ray.org_x = r[0]; + rh.ray.org_y = r[1]; + rh.ray.org_z = r[2]; + rh.ray.dir_x = r[3]; + rh.ray.dir_y = r[4]; + rh.ray.dir_z = r[5]; + rh.ray.tnear = 0; + rh.ray.tfar = std::numeric_limits::infinity(); + rh.ray.mask = -1; + rh.ray.flags = 0; + rh.ray.id = i; + rh.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; + + rtcIntersect1(scene, &rh, &args); + }); + }); + queue_.wait_and_throw(); + + // Free the allocated memory + sycl::free(ci_previous_geom_prim_ID_tfar, queue_); + ci_previous_geom_prim_ID_tfar = nullptr; + } + + void ListIntersections(const float* const rays, + const size_t num_rays, + const size_t num_intersections, + int* cumsum, + unsigned int* track_intersections, + unsigned int* ray_ids, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* t_hit, + const int nthreads) override { + CommitScene(); + + queue_.memset(track_intersections, 0, sizeof(uint32_t) * num_rays) + .wait(); + queue_.memset(ray_ids, 0, sizeof(uint32_t) * num_intersections).wait(); + queue_.memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections) + .wait(); + queue_.memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections) + .wait(); + queue_.memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2) + .wait(); + queue_.memset(t_hit, 0, sizeof(float) * num_intersections).wait(); + + li_previous_geom_prim_ID_tfar = + sycl::malloc_device(num_rays, queue_); + + // Check if allocation was successful + if (!li_previous_geom_prim_ID_tfar) { + throw std::runtime_error("Failed to allocate device memory"); + } + + auto host_previous_geom_prim_ID_tfar = + std::unique_ptr>( + new callbacks::GeomPrimID[num_rays]); + for (size_t i = 0; i < num_rays; ++i) { + host_previous_geom_prim_ID_tfar[i] = { + uint32_t(RTC_INVALID_GEOMETRY_ID), + uint32_t(RTC_INVALID_GEOMETRY_ID), 0.f}; + } + + // Copy the initialized data to the device + queue_.memcpy(li_previous_geom_prim_ID_tfar, + host_previous_geom_prim_ID_tfar.get(), + num_rays * sizeof(callbacks::GeomPrimID)) + .wait(); + + auto scene = this->scene_; + auto li_previous_geom_prim_ID_tfar_ = li_previous_geom_prim_ID_tfar; + queue_.submit([=](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>(num_rays), + [=](sycl::item<1> item, sycl::kernel_handler kh) { + callbacks::ListIntersectionsContext context; + rtcInitRayQueryContext(&context.context); + context.previous_geom_prim_ID_tfar = + li_previous_geom_prim_ID_tfar_; + context.ray_ids = ray_ids; + context.geometry_ids = geometry_ids; + context.primitive_ids = primitive_ids; + context.primitive_uvs = primitive_uvs; + context.t_hit = t_hit; + context.cumsum = cumsum; + context.track_intersections = track_intersections; + + RTCIntersectArguments args; + rtcInitIntersectArguments(&args); + args.filter = callbacks::ListIntersectionsFunc; + args.context = &context.context; + + const size_t i = item.get_id(0); + + struct RTCRayHit rh; + const float* r = &rays[i * 6]; + rh.ray.org_x = r[0]; + rh.ray.org_y = r[1]; + rh.ray.org_z = r[2]; + rh.ray.dir_x = r[3]; + rh.ray.dir_y = r[4]; + rh.ray.dir_z = r[5]; + rh.ray.tnear = 0; + rh.ray.tfar = std::numeric_limits::infinity(); + rh.ray.mask = -1; + rh.ray.flags = 0; + rh.ray.id = i; + rh.hit.geomID = RTC_INVALID_GEOMETRY_ID; + rh.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; + + rtcIntersect1(scene, &rh, &args); + }); + }); + queue_.wait_and_throw(); + + // Free the allocated memory + sycl::free(li_previous_geom_prim_ID_tfar, queue_); + li_previous_geom_prim_ID_tfar = nullptr; + } + + void ComputeClosestPoints(const float* const query_points, + const size_t num_query_points, + float* closest_points, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* primitive_normals, + const int nthreads) override { + throw std::logic_error("Function not yet implemented"); + } + + void ArraySum(int* data_ptr, size_t num_elements, size_t& result) override { + sycl::buffer result_buf(&result, sycl::range<1>(1)); + + queue_.submit([&](sycl::handler& cgh) { + auto result_acc = + result_buf.get_access(cgh); + cgh.parallel_for( + sycl::range<1>(num_elements), + [=](sycl::item<1> item, sycl::kernel_handler kh) { + const size_t i = item.get_id(0); + sycl::atomic_ref + atomic_result_data(result_acc[0]); + atomic_result_data.fetch_add(data_ptr[i]); + }); + }); + queue_.wait_and_throw(); + } + + void ArrayPartialSum(int* input, + int* output, + size_t num_elements) override { + queue_.submit([&](sycl::handler& cgh) { + cgh.single_task([=]() { + for (size_t idx = 1; idx < num_elements; ++idx) { + output[idx] = output[idx - 1] + input[idx - 1]; + } + }); + }); + + queue_.wait_and_throw(); + } + + void CopyArray(int* src, uint32_t* dst, size_t num_elements) override { + queue_.memcpy(dst, src, num_elements * sizeof(uint32_t)).wait(); + } +}; +#endif + +struct RaycastingScene::CPUImpl : public RaycastingScene::Impl { + // The maximum number of rays used in calls to embree. + const size_t BATCH_SIZE = 1024; + + void CastRays(const float* const rays, + const size_t num_rays, + float* t_hit, + unsigned int* geometry_ids, + unsigned int* primitive_ids, + float* primitive_uvs, + float* primitive_normals, + const int nthreads, + const bool line_intersection) override { CommitScene(); auto LoopFn = [&](const tbb::blocked_range& range) { @@ -370,7 +845,7 @@ struct RaycastingScene::Impl { rh.ray.org_x = r[0]; rh.ray.org_y = r[1]; rh.ray.org_z = r[2]; - if (LINE_INTERSECTION) { + if (line_intersection) { rh.ray.dir_x = r[3] - r[0]; rh.ray.dir_y = r[4] - r[1]; rh.ray.dir_z = r[5] - r[2]; @@ -380,7 +855,7 @@ struct RaycastingScene::Impl { rh.ray.dir_z = r[5]; } rh.ray.tnear = 0; - if (LINE_INTERSECTION) { + if (line_intersection) { rh.ray.tfar = 1.f; } else { rh.ray.tfar = std::numeric_limits::infinity(); @@ -440,7 +915,7 @@ struct RaycastingScene::Impl { const float tnear, const float tfar, int8_t* occluded, - const int nthreads) { + const int nthreads) override { CommitScene(); struct RTCRayQueryContext context; @@ -495,26 +970,29 @@ struct RaycastingScene::Impl { void CountIntersections(const float* const rays, const size_t num_rays, int* intersections, - const int nthreads) { + const int nthreads) override { CommitScene(); - memset(intersections, 0, sizeof(int) * num_rays); + std::memset(intersections, 0, sizeof(int) * num_rays); - std::vector> - previous_geom_prim_ID_tfar( - num_rays, - std::make_tuple(uint32_t(RTC_INVALID_GEOMETRY_ID), - uint32_t(RTC_INVALID_GEOMETRY_ID), - 0.f)); + auto previous_geom_prim_ID_tfar = + std::unique_ptr>( + new callbacks::GeomPrimID[num_rays]); + for (size_t i = 0; i < num_rays; ++i) { + previous_geom_prim_ID_tfar[i] = {uint32_t(RTC_INVALID_GEOMETRY_ID), + uint32_t(RTC_INVALID_GEOMETRY_ID), + 0.f}; + } - CountIntersectionsContext context; + callbacks::CountIntersectionsContext context; rtcInitRayQueryContext(&context.context); - context.previous_geom_prim_ID_tfar = &previous_geom_prim_ID_tfar; + context.previous_geom_prim_ID_tfar = previous_geom_prim_ID_tfar.get(); context.intersections = intersections; RTCIntersectArguments args; rtcInitIntersectArguments(&args); - args.filter = CountIntersectionsFunc; + args.filter = callbacks::CountIntersectionsFunc; args.context = &context.context; auto LoopFn = [&](const tbb::blocked_range& range) { @@ -558,33 +1036,36 @@ struct RaycastingScene::Impl { void ListIntersections(const float* const rays, const size_t num_rays, const size_t num_intersections, - const Eigen::VectorXi& cumsum, + int* cumsum, unsigned int* track_intersections, unsigned int* ray_ids, unsigned int* geometry_ids, unsigned int* primitive_ids, float* primitive_uvs, float* t_hit, - const int nthreads) { + const int nthreads) override { CommitScene(); - memset(track_intersections, 0, sizeof(uint32_t) * num_rays); - memset(ray_ids, 0, sizeof(uint32_t) * num_intersections); - memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections); - memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections); - memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2); - memset(t_hit, 0, sizeof(float) * num_intersections); - - std::vector> - previous_geom_prim_ID_tfar( - num_rays, - std::make_tuple(uint32_t(RTC_INVALID_GEOMETRY_ID), - uint32_t(RTC_INVALID_GEOMETRY_ID), - 0.f)); - - ListIntersectionsContext context; + std::memset(track_intersections, 0, sizeof(uint32_t) * num_rays); + std::memset(ray_ids, 0, sizeof(uint32_t) * num_intersections); + std::memset(geometry_ids, 0, sizeof(uint32_t) * num_intersections); + std::memset(primitive_ids, 0, sizeof(uint32_t) * num_intersections); + std::memset(primitive_uvs, 0, sizeof(float) * num_intersections * 2); + std::memset(t_hit, 0, sizeof(float) * num_intersections); + + auto previous_geom_prim_ID_tfar = + std::unique_ptr>( + new callbacks::GeomPrimID[num_rays]); + for (size_t i = 0; i < num_rays; ++i) { + previous_geom_prim_ID_tfar[i] = {uint32_t(RTC_INVALID_GEOMETRY_ID), + uint32_t(RTC_INVALID_GEOMETRY_ID), + 0.f}; + } + + callbacks::ListIntersectionsContext context; rtcInitRayQueryContext(&context.context); - context.previous_geom_prim_ID_tfar = &previous_geom_prim_ID_tfar; + context.previous_geom_prim_ID_tfar = previous_geom_prim_ID_tfar.get(); context.ray_ids = ray_ids; context.geometry_ids = geometry_ids; context.primitive_ids = primitive_ids; @@ -595,7 +1076,7 @@ struct RaycastingScene::Impl { RTCIntersectArguments args; rtcInitIntersectArguments(&args); - args.filter = ListIntersectionsFunc; + args.filter = callbacks::ListIntersectionsFunc; args.context = &context.context; auto LoopFn = [&](const tbb::blocked_range& range) { @@ -643,7 +1124,7 @@ struct RaycastingScene::Impl { unsigned int* primitive_ids, float* primitive_uvs, float* primitive_normals, - const int nthreads) { + const int nthreads) override { CommitScene(); auto LoopFn = [&](const tbb::blocked_range& range) { @@ -655,12 +1136,13 @@ struct RaycastingScene::Impl { query.radius = std::numeric_limits::infinity(); query.time = 0.f; - ClosestPointResult result; - result.geometry_ptrs_ptr = &geometry_ptrs_; + ClosestPointResult result; + result.geometry_ptrs_ptr = geometry_ptrs_.data(); RTCPointQueryContext instStack; rtcInitPointQueryContext(&instStack); - rtcPointQuery(scene_, &query, &instStack, &ClosestPointFunc, + rtcPointQuery(scene_, &query, &instStack, + &ClosestPointFunc, (void*)&result); closest_points[3 * i + 0] = result.p.x(); @@ -689,16 +1171,44 @@ struct RaycastingScene::Impl { LoopFn); } } + + void ArraySum(int* data_ptr, size_t num_elements, size_t& result) override { + result = std::accumulate(data_ptr, data_ptr + num_elements, result); + } + + void ArrayPartialSum(int* input, + int* output, + size_t num_elements) override { + output[0] = 0; + std::partial_sum(input, input + num_elements - 1, output + 1); + } + + void CopyArray(int* src, uint32_t* dst, size_t num_elements) override { + std::copy(src, src + num_elements, dst); + } }; -RaycastingScene::RaycastingScene(int64_t nthreads) - : impl_(new RaycastingScene::Impl()) { - if (nthreads > 0) { - std::string config("threads=" + std::to_string(nthreads)); - impl_->device_ = rtcNewDevice(config.c_str()); +RaycastingScene::RaycastingScene(int64_t nthreads, const core::Device& device) { +#ifdef BUILD_SYCL_MODULE + if (device.IsSYCL()) { + impl_ = std::make_unique(); + dynamic_cast(impl_.get()) + ->InitializeDevice(); + impl_->tensor_device_ = device; } else { - impl_->device_ = rtcNewDevice(NULL); +#endif + impl_ = std::make_unique(); + + if (nthreads > 0) { + std::string config("threads=" + std::to_string(nthreads)); + impl_->device_ = rtcNewDevice(config.c_str()); + } else { + impl_->device_ = rtcNewDevice(NULL); + } +#ifdef BUILD_SYCL_MODULE } +#endif + rtcSetDeviceErrorFunction(impl_->device_, ErrorFunction, NULL); impl_->scene_ = rtcNewScene(impl_->device_); @@ -746,13 +1256,37 @@ uint32_t RaycastingScene::AddTriangles(const core::Tensor& vertex_positions, { auto data = vertex_positions.Contiguous(); - memcpy(vertex_buffer, data.GetDataPtr(), - sizeof(float) * 3 * num_vertices); +#ifdef BUILD_SYCL_MODULE + if (impl_->tensor_device_.IsSYCL()) { + dynamic_cast(impl_.get()) + ->queue_ + .memcpy(vertex_buffer, data.GetDataPtr(), + sizeof(float) * 3 * num_vertices) + .wait(); + } else { +#endif + std::memcpy(vertex_buffer, data.GetDataPtr(), + sizeof(float) * 3 * num_vertices); +#ifdef BUILD_SYCL_MODULE + } +#endif } { auto data = triangle_indices.Contiguous(); - memcpy(index_buffer, data.GetDataPtr(), - sizeof(uint32_t) * 3 * num_triangles); +#ifdef BUILD_SYCL_MODULE + if (impl_->tensor_device_.IsSYCL()) { + dynamic_cast(impl_.get()) + ->queue_ + .memcpy(index_buffer, data.GetDataPtr(), + sizeof(uint32_t) * 3 * num_triangles) + .wait(); + } else { +#endif + std::memcpy(index_buffer, data.GetDataPtr(), + sizeof(uint32_t) * 3 * num_triangles); +#ifdef BUILD_SYCL_MODULE + } +#endif } rtcSetGeometryEnableFilterFunctionFromArguments(geom, true); rtcCommitGeometry(geom); @@ -760,9 +1294,10 @@ uint32_t RaycastingScene::AddTriangles(const core::Tensor& vertex_positions, uint32_t geom_id = rtcAttachGeometry(impl_->scene_, geom); rtcReleaseGeometry(geom); - impl_->geometry_ptrs_.push_back(std::make_tuple(RTC_GEOMETRY_TYPE_TRIANGLE, - (const void*)vertex_buffer, - (const void*)index_buffer)); + GeometryPtr geometry_ptr = {RTC_GEOMETRY_TYPE_TRIANGLE, + (const void*)vertex_buffer, + (const void*)index_buffer}; + impl_->geometry_ptrs_.push_back(geometry_ptr); return geom_id; } @@ -787,22 +1322,26 @@ std::unordered_map RaycastingScene::CastRays( size_t num_rays = shape.NumElements(); std::unordered_map result; - result["t_hit"] = core::Tensor(shape, core::Float32); - result["geometry_ids"] = core::Tensor(shape, core::UInt32); - result["primitive_ids"] = core::Tensor(shape, core::UInt32); + result["t_hit"] = core::Tensor(shape, core::Float32, rays.GetDevice()); + result["geometry_ids"] = + core::Tensor(shape, core::UInt32, rays.GetDevice()); + result["primitive_ids"] = + core::Tensor(shape, core::UInt32, rays.GetDevice()); shape.push_back(2); - result["primitive_uvs"] = core::Tensor(shape, core::Float32); + result["primitive_uvs"] = + core::Tensor(shape, core::Float32, rays.GetDevice()); shape.back() = 3; - result["primitive_normals"] = core::Tensor(shape, core::Float32); + result["primitive_normals"] = + core::Tensor(shape, core::Float32, rays.GetDevice()); auto data = rays.Contiguous(); - impl_->CastRays(data.GetDataPtr(), num_rays, - result["t_hit"].GetDataPtr(), - result["geometry_ids"].GetDataPtr(), - result["primitive_ids"].GetDataPtr(), - result["primitive_uvs"].GetDataPtr(), - result["primitive_normals"].GetDataPtr(), - nthreads); + impl_->CastRays(data.GetDataPtr(), num_rays, + result["t_hit"].GetDataPtr(), + result["geometry_ids"].GetDataPtr(), + result["primitive_ids"].GetDataPtr(), + result["primitive_uvs"].GetDataPtr(), + result["primitive_normals"].GetDataPtr(), nthreads, + false); return result; } @@ -818,7 +1357,7 @@ core::Tensor RaycastingScene::TestOcclusions(const core::Tensor& rays, // results. size_t num_rays = shape.NumElements(); - core::Tensor result(shape, core::Bool); + core::Tensor result(shape, core::Bool, rays.GetDevice()); auto data = rays.Contiguous(); impl_->TestOcclusions(data.GetDataPtr(), num_rays, tnear, tfar, @@ -837,7 +1376,8 @@ core::Tensor RaycastingScene::CountIntersections(const core::Tensor& rays, // results. size_t num_rays = shape.NumElements(); - core::Tensor intersections(shape, core::Dtype::FromType()); + core::Tensor intersections(shape, core::Dtype::FromType(), + impl_->tensor_device_); auto data = rays.Contiguous(); @@ -858,49 +1398,57 @@ RaycastingScene::ListIntersections(const core::Tensor& rays, size_t num_rays = shape.NumElements(); // determine total number of intersections - core::Tensor intersections(shape, core::Dtype::FromType()); - core::Tensor track_intersections(shape, core::Dtype::FromType()); + core::Tensor intersections(shape, core::Dtype::FromType(), + impl_->tensor_device_); + core::Tensor track_intersections(shape, core::Dtype::FromType(), + impl_->tensor_device_); auto data = rays.Contiguous(); impl_->CountIntersections(data.GetDataPtr(), num_rays, intersections.GetDataPtr(), nthreads); // prepare shape with that number of elements - Eigen::Map intersections_vector( - intersections.GetDataPtr(), num_rays); - size_t num_intersections = intersections_vector.sum(); + int* data_ptr = intersections.GetDataPtr(); + size_t num_intersections = 0; + impl_->ArraySum(data_ptr, num_rays, num_intersections); // prepare ray allocations (cumsum) - Eigen::VectorXi cumsum = Eigen::MatrixXi::Zero(num_rays, 1); - std::partial_sum(intersections_vector.begin(), - intersections_vector.end() - 1, cumsum.begin() + 1, - std::plus()); + core::Tensor cumsum_tensor_cpu = + core::Tensor::Zeros(shape, core::Dtype::FromType()); + core::Tensor cumsum_tensor = cumsum_tensor_cpu.To(impl_->tensor_device_); + int* cumsum_ptr = cumsum_tensor.GetDataPtr(); + impl_->ArrayPartialSum(data_ptr, cumsum_ptr, num_rays); // generate results structure std::unordered_map result; shape.clear(); shape.push_back(num_rays + 1); result["ray_splits"] = core::Tensor(shape, core::UInt32); + uint32_t* ptr = result["ray_splits"].GetDataPtr(); - for (int i = 0; i < cumsum.size(); ++i) { - ptr[i] = cumsum[i]; - } + impl_->CopyArray(cumsum_ptr, ptr, num_rays); + ptr[num_rays] = num_intersections; - shape[0] = intersections_vector.sum(); - result["ray_ids"] = core::Tensor(shape, core::UInt32); - result["geometry_ids"] = core::Tensor(shape, core::UInt32); - result["primitive_ids"] = core::Tensor(shape, core::UInt32); - result["t_hit"] = core::Tensor(shape, core::Float32); + shape[0] = num_intersections; + result["ray_ids"] = + core::Tensor(shape, core::UInt32, impl_->tensor_device_); + result["geometry_ids"] = + core::Tensor(shape, core::UInt32, impl_->tensor_device_); + result["primitive_ids"] = + core::Tensor(shape, core::UInt32, impl_->tensor_device_); + result["t_hit"] = core::Tensor(shape, core::Float32, impl_->tensor_device_); shape.push_back(2); - result["primitive_uvs"] = core::Tensor(shape, core::Float32); + result["primitive_uvs"] = + core::Tensor(shape, core::Float32, impl_->tensor_device_); impl_->ListIntersections(data.GetDataPtr(), num_rays, - num_intersections, cumsum, + num_intersections, cumsum_ptr, track_intersections.GetDataPtr(), result["ray_ids"].GetDataPtr(), result["geometry_ids"].GetDataPtr(), result["primitive_ids"].GetDataPtr(), result["primitive_uvs"].GetDataPtr(), result["t_hit"].GetDataPtr(), nthreads); + return result; } diff --git a/cpp/open3d/t/geometry/RaycastingScene.h b/cpp/open3d/t/geometry/RaycastingScene.h index f25d994b0b5..3a533aba96e 100644 --- a/cpp/open3d/t/geometry/RaycastingScene.h +++ b/cpp/open3d/t/geometry/RaycastingScene.h @@ -30,14 +30,16 @@ namespace geometry { class RaycastingScene { public: /// \brief Default Constructor. - RaycastingScene(int64_t nthreads = 0); + RaycastingScene(int64_t nthreads = 0, + const core::Device &device = core::Device("CPU:0")); ~RaycastingScene(); /// \brief Add a triangle mesh to the scene. /// \param vertex_positions Vertices as Tensor of dim {N,3} and dtype float. /// \param triangle_indices Triangles as Tensor of dim {M,3} and dtype - /// uint32_t. \return The geometry ID of the added mesh. + /// uint32_t. + /// \return The geometry ID of the added mesh. uint32_t AddTriangles(const core::Tensor &vertex_positions, const core::Tensor &triangle_indices); @@ -250,6 +252,10 @@ class RaycastingScene { private: struct Impl; + struct CPUImpl; +#ifdef BUILD_SYCL_MODULE + struct SYCLImpl; +#endif std::unique_ptr impl_; }; diff --git a/cpp/pybind/CMakeLists.txt b/cpp/pybind/CMakeLists.txt index c79bbd96719..6efae9a17fd 100644 --- a/cpp/pybind/CMakeLists.txt +++ b/cpp/pybind/CMakeLists.txt @@ -106,7 +106,7 @@ endif() # Include additional libraries that may be absent from the user system # eg: libc++.so, libc++abi.so (needed by filament) for Linux. # libc++.so is a linker script including libc++.so.1 and libc++abi.so, so append 1 to libc++.so -set(PYTHON_EXTRA_LIBRARIES $) +set(PYTHON_EXTRA_LIBRARIES $) if (BUILD_GUI AND CMAKE_SYSTEM_NAME STREQUAL "Linux") list(APPEND PYTHON_EXTRA_LIBRARIES ${CPP_LIBRARY}.1 ${CPPABI_LIBRARY}) endif() diff --git a/cpp/pybind/core/sycl_utils.cpp b/cpp/pybind/core/sycl_utils.cpp index 0acf8c37a5b..e50d82b8f53 100644 --- a/cpp/pybind/core/sycl_utils.cpp +++ b/cpp/pybind/core/sycl_utils.cpp @@ -13,7 +13,20 @@ namespace open3d { namespace core { void pybind_sycl_utils_definitions(py::module& m) { - m.def("sycl_demo", &sycl::SYCLDemo); + m.def("sycl_demo", &sy::SYCLDemo); + + py::module m_sycl = m.def_submodule("sycl"); + m_sycl.def("is_available", sy::IsAvailable, + "Returns true if Open3D is compiled with SYCL support and at " + "least one compatible SYCL device is detected."); + + m_sycl.def("get_available_devices", sy::GetAvailableSYCLDevices, + "Return a list of available SYCL devices."); + + m_sycl.def("enable_persistent_jit_cache", sy::enablePersistentJITCache, + "Enables the JIT cache for SYCL. This sets an environment " + "variable and " + "will affect the entire process and any child processes."); } } // namespace core diff --git a/cpp/pybind/t/geometry/raycasting_scene.cpp b/cpp/pybind/t/geometry/raycasting_scene.cpp index b3e8983aaef..b611dfcfbb0 100644 --- a/cpp/pybind/t/geometry/raycasting_scene.cpp +++ b/cpp/pybind/t/geometry/raycasting_scene.cpp @@ -22,7 +22,7 @@ or compute the closest point on the surface of a mesh with respect to one or more query points. It builds an internal acceleration structure to speed up those queries. -This class supports only the CPU device. +This class supports the CPU device and SYCL GPU device. The following shows how to create a scene and compute ray intersections:: @@ -58,11 +58,13 @@ void pybind_raycasting_scene_definitions(py::module& m) { auto raycasting_scene = static_cast>(m.attr("RaycastingScene")); // Constructors. - raycasting_scene.def(py::init(), "nthreads"_a = 0, R"doc( + raycasting_scene.def(py::init(), "nthreads"_a = 0, + "device"_a = core::Device("CPU:0"), R"doc( Create a RaycastingScene. Args: nthreads (int): The number of threads to use for building the scene. Set to 0 for automatic. + device (open3d.core.Device): The device to use. Currently CPU and SYCL devices are supported. )doc"); raycasting_scene.def( diff --git a/cpp/tests/core/SYCLUtils.cpp b/cpp/tests/core/SYCLUtils.cpp index d10d99f09fa..e4462f53be9 100644 --- a/cpp/tests/core/SYCLUtils.cpp +++ b/cpp/tests/core/SYCLUtils.cpp @@ -18,18 +18,18 @@ namespace open3d { namespace tests { -TEST(SYCLUtils, SYCLDemo) { core::sycl::SYCLDemo(); } +TEST(SYCLUtils, SYCLDemo) { core::sy::SYCLDemo(); } TEST(SYCLUtils, PrintAllSYCLDevices) { - core::sycl::PrintSYCLDevices(/*print_all=*/true); + core::sy::PrintSYCLDevices(/*print_all=*/true); } TEST(SYCLUtils, PrintSYCLDevices) { - core::sycl::PrintSYCLDevices(/*print_all=*/false); + core::sy::PrintSYCLDevices(/*print_all=*/false); } TEST(SYCLUtils, SYCLUnifiedSharedMemory) { - if (!core::sycl::IsAvailable()) { + if (!core::sy::IsAvailable()) { return; } diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci old mode 100644 new mode 100755 index 73feb7bffd6..ac98e50236f --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -89,7 +89,7 @@ RUN apt-get update && apt-get install -y \ liblzma-dev \ && rm -rf /var/lib/apt/lists/* -# pyenv or Intel Python +# pyenv # The pyenv python paths are used during docker run, in this way docker run # does not need to activate the environment again. # The soft link from the python patch level version to the python mino version @@ -97,15 +97,12 @@ RUN apt-get update && apt-get install -y \ # which patch level pyenv will install (latest). ENV PYENV_ROOT=/root/.pyenv ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PYENV_ROOT/versions/$PYTHON_VERSION/bin:$PATH" -ENV PATH="/opt/intel/oneapi/intelpython/latest/bin:${PATH}" -RUN if [ "${BUILD_SYCL_MODULE}" = "OFF" ]; then \ - curl https://pyenv.run | bash \ +RUN curl https://pyenv.run | bash \ && pyenv update \ && pyenv install $PYTHON_VERSION \ && pyenv global $PYTHON_VERSION \ && pyenv rehash \ - && ln -s $PYENV_ROOT/versions/${PYTHON_VERSION}* $PYENV_ROOT/versions/${PYTHON_VERSION}; \ - fi + && ln -s $PYENV_ROOT/versions/${PYTHON_VERSION}* $PYENV_ROOT/versions/${PYTHON_VERSION}; RUN python --version && pip --version SHELL ["/bin/bash", "-o", "pipefail", "-c"] @@ -191,9 +188,11 @@ RUN \ export CMAKE_CXX_COMPILER=icpx; \ export CMAKE_C_COMPILER=icx; \ export GLIBCXX_USE_CXX11_ABI=ON; \ + export BUILD_ISPC_MODULE=OFF; \ else \ export CMAKE_CXX_COMPILER=g++; \ export CMAKE_C_COMPILER=gcc; \ + export BUILD_ISPC_MODULE=ON; \ # TODO: PyTorch still use old CXX ABI, remove this line when PyTorch is updated if [ "$BUILD_PYTORCH_OPS" = "ON" ]; then \ export GLIBCXX_USE_CXX11_ABI=OFF; \ @@ -208,11 +207,12 @@ RUN \ -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} \ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} \ -DBUILD_SYCL_MODULE=${BUILD_SYCL_MODULE} \ + -DBUILD_ISPC_MODULE=${BUILD_ISPC_MODULE} \ -DDEVELOPER_BUILD=${DEVELOPER_BUILD} \ -DBUILD_LIBREALSENSE=ON \ -DBUILD_CUDA_MODULE=${BUILD_CUDA_MODULE} \ -DBUILD_COMMON_CUDA_ARCHS=ON \ - -DBUILD_COMMON_ISPC_ISAS=ON \ + -DBUILD_COMMON_ISPC_ISAS=${BUILD_ISPC_MODULE} \ -DGLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI} \ -DBUILD_TENSORFLOW_OPS=${BUILD_TENSORFLOW_OPS} \ -DBUILD_PYTORCH_OPS=${BUILD_PYTORCH_OPS} \ diff --git a/docker/docker_build.sh b/docker/docker_build.sh index c92aed26829..56cc75fb030 100755 --- a/docker/docker_build.sh +++ b/docker/docker_build.sh @@ -82,7 +82,7 @@ HOST_OPEN3D_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")"/.. >/dev/null 2>&1 && pw # Shared variables CCACHE_VERSION=4.3 -CMAKE_VERSION=cmake-3.24.4-linux-x86_64 +CMAKE_VERSION=cmake-3.29.2-linux-x86_64 CMAKE_VERSION_AARCH64=cmake-3.24.4-linux-aarch64 CUDA_VERSION=12.1.0-cudnn8 CUDA_VERSION_LATEST=12.1.0-cudnn8 @@ -447,8 +447,8 @@ sycl-shared_export_env() { export DOCKER_TAG=open3d-ci:sycl-shared # https://hub.docker.com/r/intel/oneapi-basekit - # https://github.com/intel/oneapi-containers/blob/main/images/docker/basekit/Dockerfile.ubuntu-20.04 - export BASE_IMAGE=intel/oneapi-basekit:2022.2-devel-ubuntu20.04 + # https://github.com/intel/oneapi-containers/blob/master/images/docker/basekit/Dockerfile.ubuntu-20.04 + export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl export PYTHON_VERSION=3.8 @@ -464,8 +464,8 @@ sycl-static_export_env() { export DOCKER_TAG=open3d-ci:sycl-static # https://hub.docker.com/r/intel/oneapi-basekit - # https://github.com/intel/oneapi-containers/blob/main/images/docker/basekit/Dockerfile.ubuntu-20.04 - export BASE_IMAGE=intel/oneapi-basekit:2022.2-devel-ubuntu20.04 + # https://github.com/intel/oneapi-containers/blob/master/images/docker/basekit/Dockerfile.ubuntu-20.04 + export BASE_IMAGE=intel/oneapi-basekit:2024.1.0-devel-ubuntu20.04 export DEVELOPER_BUILD=ON export CCACHE_TAR_NAME=open3d-ci-sycl export PYTHON_VERSION=3.8 diff --git a/examples/cpp/SYCLDemo.cpp b/examples/cpp/SYCLDemo.cpp index 2fcd8c55dd1..d72db5ea250 100644 --- a/examples/cpp/SYCLDemo.cpp +++ b/examples/cpp/SYCLDemo.cpp @@ -8,6 +8,6 @@ #include "open3d/core/SYCLUtils.h" int main() { - open3d::core::sycl::SYCLDemo(); + open3d::core::sy::SYCLDemo(); return 0; } diff --git a/python/test/open3d_test.py b/python/test/open3d_test.py old mode 100644 new mode 100755 index 46a768746d0..e6997f7a418 --- a/python/test/open3d_test.py +++ b/python/test/open3d_test.py @@ -5,14 +5,6 @@ # SPDX-License-Identifier: MIT # ---------------------------------------------------------------------------- -import os -import sys -import urllib.request -import zipfile - -import numpy as np -import pytest - def torch_available(): try: @@ -23,20 +15,21 @@ def torch_available(): return True -def list_devices(): +def list_devices(enable_cuda=True, enable_sycl=False): """ - If Open3D is built with CUDA support: - - If cuda device is available, returns [Device("CPU:0"), Device("CUDA:0")]. - - If cuda device is not available, returns [Device("CPU:0")]. - - If Open3D is built without CUDA support: - - returns [Device("CPU:0")]. + Returns a list of devices that are available for Open3D to use: + - Device("CPU:0") + - Device("CUDA:0") if built with CUDA support and a CUDA device is available. + - Device("SYCL:0") if built with SYCL support and a SYCL device is available. """ import open3d as o3d - if o3d.core.cuda.device_count() > 0: - return [o3d.core.Device("CPU:0"), o3d.core.Device("CUDA:0")] - else: - return [o3d.core.Device("CPU:0")] + + devices = [o3d.core.Device("CPU:0")] + if enable_cuda and o3d.core.cuda.device_count() > 0: + devices.append(o3d.core.Device("CUDA:0")) + if enable_sycl and o3d.core.sycl.is_available(): + return devices.append(o3d.core.Device("SYCL:0")) + return devices def list_devices_with_torch(): diff --git a/python/test/t/geometry/test_raycasting_scene.py b/python/test/t/geometry/test_raycasting_scene.py old mode 100644 new mode 100755 index 3ce024a2b29..3fcb9ea2d28 --- a/python/test/t/geometry/test_raycasting_scene.py +++ b/python/test/t/geometry/test_raycasting_scene.py @@ -9,57 +9,86 @@ import numpy as np import pytest +import sys +import os + +sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../..") +from open3d_test import list_devices + # test intersection with a single triangle -def test_cast_rays(): +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_cast_rays(device): vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]], - dtype=o3d.core.float32) - triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32) + dtype=o3d.core.float32, + device=device) + triangles = o3d.core.Tensor([[0, 1, 2]], + dtype=o3d.core.uint32, + device=device) - scene = o3d.t.geometry.RaycastingScene() + scene = o3d.t.geometry.RaycastingScene(device=device) geom_id = scene.add_triangles(vertices, triangles) - rays = o3d.core.Tensor([[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]], - dtype=o3d.core.float32) + rays = o3d.core.Tensor( + [[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]], + dtype=o3d.core.float32, + device=device, + ) ans = scene.cast_rays(rays) # first ray hits the triangle - assert geom_id == ans['geometry_ids'][0] - assert np.isclose(ans['t_hit'][0].item(), 1.0) + assert geom_id == ans["geometry_ids"][0].cpu() + assert np.isclose(ans["t_hit"][0].item(), 1.0) # second ray misses - assert o3d.t.geometry.RaycastingScene.INVALID_ID == ans['geometry_ids'][1] - assert np.isinf(ans['t_hit'][1].item()) + assert o3d.t.geometry.RaycastingScene.INVALID_ID == ans["geometry_ids"][ + 1].cpu() + assert np.isinf(ans["t_hit"][1].item()) # cast lots of random rays to test the internal batching # we expect no errors for this test -def test_cast_lots_of_rays(): +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_cast_lots_of_rays(device): vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]], - dtype=o3d.core.float32) - triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32) + dtype=o3d.core.float32, + device=device) + triangles = o3d.core.Tensor([[0, 1, 2]], + dtype=o3d.core.uint32, + device=device) - scene = o3d.t.geometry.RaycastingScene() + scene = o3d.t.geometry.RaycastingScene(device=device) scene.add_triangles(vertices, triangles) rs = np.random.RandomState(123) rays = o3d.core.Tensor.from_numpy(rs.rand(7654321, 6).astype(np.float32)) + rays = rays.to(device) _ = scene.cast_rays(rays) # test occlusion with a single triangle -def test_test_occlusions(): +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_test_occlusions(device): vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]], - dtype=o3d.core.float32) - triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32) + dtype=o3d.core.float32, + device=device) + triangles = o3d.core.Tensor([[0, 1, 2]], + dtype=o3d.core.uint32, + device=device) - scene = o3d.t.geometry.RaycastingScene() + scene = o3d.t.geometry.RaycastingScene(device=device) scene.add_triangles(vertices, triangles) - rays = o3d.core.Tensor([[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]], - dtype=o3d.core.float32) - ans = scene.test_occlusions(rays) + rays = o3d.core.Tensor( + [[0.2, 0.1, 1, 0, 0, -1], [10, 10, 10, 1, 0, 0]], + dtype=o3d.core.float32, + device=device, + ) + ans = scene.test_occlusions(rays).cpu() # first ray is occluded by the triangle assert ans[0] == True @@ -68,88 +97,124 @@ def test_test_occlusions(): assert ans[1] == False # set tfar such that no ray is occluded - ans = scene.test_occlusions(rays, tfar=0.5) + ans = scene.test_occlusions(rays, tfar=0.5).cpu() assert ans.any() == False # set tnear such that no ray is occluded - ans = scene.test_occlusions(rays, tnear=1.5) + ans = scene.test_occlusions(rays, tnear=1.5).cpu() assert ans.any() == False # test lots of random rays for occlusions to test the internal batching # we expect no errors for this test -def test_test_lots_of_occlusions(): +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_test_lots_of_occlusions(device): vertices = o3d.core.Tensor([[0, 0, 0], [1, 0, 0], [1, 1, 0]], - dtype=o3d.core.float32) - triangles = o3d.core.Tensor([[0, 1, 2]], dtype=o3d.core.uint32) + dtype=o3d.core.float32, + device=device) + triangles = o3d.core.Tensor([[0, 1, 2]], + dtype=o3d.core.uint32, + device=device) - scene = o3d.t.geometry.RaycastingScene() + scene = o3d.t.geometry.RaycastingScene(device=device) scene.add_triangles(vertices, triangles) rs = np.random.RandomState(123) rays = o3d.core.Tensor.from_numpy(rs.rand(7654321, 6).astype(np.float32)) + rays = rays.to(device) _ = scene.test_occlusions(rays) -def test_add_triangle_mesh(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_add_triangle_mesh(device): + cube = o3d.t.geometry.TriangleMesh.create_box() + cube = cube.to(device) - scene = o3d.t.geometry.RaycastingScene() + scene = o3d.t.geometry.RaycastingScene(device=device) scene.add_triangles(cube) - rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], - [10, 10, 10, 1, 0, 0]], - dtype=o3d.core.float32) + rays = o3d.core.Tensor( + [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], + [10, 10, 10, 1, 0, 0]], + dtype=o3d.core.float32, + device=device, + ) ans = scene.count_intersections(rays) - np.testing.assert_equal(ans.numpy(), [2, 1, 0]) + np.testing.assert_equal(ans.cpu().numpy(), [2, 1, 0]) -def test_count_intersections(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_count_intersections(device): + cube = o3d.t.geometry.TriangleMesh.create_box() + vertex_positions = cube.vertex.positions + vertex_positions = vertex_positions.to(device) + triangle_indices = cube.triangle.indices + triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32) + triangle_indices = triangle_indices.to(device) - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(cube) + scene = o3d.t.geometry.RaycastingScene(device=device) + scene.add_triangles(vertex_positions, triangle_indices) - rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], - [10, 10, 10, 1, 0, 0]], - dtype=o3d.core.float32) + rays = o3d.core.Tensor( + [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], + [10, 10, 10, 1, 0, 0]], + dtype=o3d.core.float32, + device=device, + ) ans = scene.count_intersections(rays) - np.testing.assert_equal(ans.numpy(), [2, 1, 0]) + np.testing.assert_equal(ans.cpu().numpy(), [2, 1, 0]) # count lots of random ray intersections to test the internal batching # we expect no errors for this test -def test_count_lots_of_intersections(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(cube) +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_count_lots_of_intersections(device): + cube = o3d.t.geometry.TriangleMesh.create_box() + vertex_positions = cube.vertex.positions + vertex_positions = vertex_positions.to(device) + triangle_indices = cube.triangle.indices + triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32) + triangle_indices = triangle_indices.to(device) + + scene = o3d.t.geometry.RaycastingScene(device=device) + scene.add_triangles(vertex_positions, triangle_indices) rs = np.random.RandomState(123) rays = o3d.core.Tensor.from_numpy(rs.rand(1234567, 6).astype(np.float32)) + rays = rays.to(device) _ = scene.count_intersections(rays) -def test_list_intersections(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(cube) - - rays = o3d.core.Tensor([[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], - [10, 10, 10, 1, 0, 0]], - dtype=o3d.core.float32) +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_list_intersections(device): + cube = o3d.t.geometry.TriangleMesh.create_box() + vertex_positions = cube.vertex.positions + vertex_positions = vertex_positions.to(device) + triangle_indices = cube.triangle.indices + triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32) + triangle_indices = triangle_indices.to(device) + + scene = o3d.t.geometry.RaycastingScene(device=device) + scene.add_triangles(vertex_positions, triangle_indices) + + rays = o3d.core.Tensor( + [[0.5, 0.5, -1, 0, 0, 1], [0.5, 0.5, 0.5, 0, 0, 1], + [10, 10, 10, 1, 0, 0]], + dtype=o3d.core.float32, + device=device, + ) ans = scene.list_intersections(rays) - np.testing.assert_allclose(ans['t_hit'].numpy(), + np.testing.assert_allclose(ans["t_hit"].cpu().numpy(), np.array([1.0, 2.0, 0.5]), rtol=1e-6, atol=1e-6) @@ -157,15 +222,22 @@ def test_list_intersections(): # list lots of random ray intersections to test the internal batching # we expect no errors for this test -def test_list_lots_of_intersections(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) - - scene = o3d.t.geometry.RaycastingScene() - scene.add_triangles(cube) +@pytest.mark.parametrize("device", + list_devices(enable_cuda=False, enable_sycl=True)) +def test_list_lots_of_intersections(device): + cube = o3d.t.geometry.TriangleMesh.create_box() + vertex_positions = cube.vertex.positions + vertex_positions = vertex_positions.to(device) + triangle_indices = cube.triangle.indices + triangle_indices = triangle_indices.to(o3d.core.Dtype.UInt32) + triangle_indices = triangle_indices.to(device) + + scene = o3d.t.geometry.RaycastingScene(device=device) + scene.add_triangles(vertex_positions, triangle_indices) rs = np.random.RandomState(123) rays = o3d.core.Tensor.from_numpy(rs.rand(123456, 6).astype(np.float32)) + rays = rays.to(device) _ = scene.list_intersections(rays) @@ -182,12 +254,14 @@ def test_compute_closest_points(): dtype=o3d.core.float32) ans = scene.compute_closest_points(query_points) - assert (geom_id == ans['geometry_ids']).all() - assert (0 == ans['primitive_ids']).all() - np.testing.assert_allclose(ans['points'].numpy(), - np.array([[0.2, 0.1, 0.0], [1, 1, 0]]), - rtol=1e-6, - atol=1e-6) + assert (geom_id == ans["geometry_ids"]).all() + assert (0 == ans["primitive_ids"]).all() + np.testing.assert_allclose( + ans["points"].numpy(), + np.array([[0.2, 0.1, 0.0], [1, 1, 0]]), + rtol=1e-6, + atol=1e-6, + ) # compute lots of closest points to test the internal batching @@ -207,8 +281,7 @@ def test_compute_lots_of_closest_points(): def test_compute_distance(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) + cube = o3d.t.geometry.TriangleMesh.create_box() scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(cube) @@ -221,8 +294,7 @@ def test_compute_distance(): def test_compute_signed_distance(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) + cube = o3d.t.geometry.TriangleMesh.create_box() scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(cube) @@ -235,8 +307,7 @@ def test_compute_signed_distance(): def test_compute_occupancy(): - cube = o3d.t.geometry.TriangleMesh.from_legacy( - o3d.geometry.TriangleMesh.create_box()) + cube = o3d.t.geometry.TriangleMesh.create_box() scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(cube) @@ -276,53 +347,49 @@ def test_output_shapes(shape): # some outputs append a specific last dim last_dim = { - 't_hit': [], - 'geometry_ids': [], - 'primitive_ids': [], - 'primitive_uvs': [2], - 'primitive_normals': [3], - 'points': [3], - 'ray_ids': [], - 'ray_splits': [] + "t_hit": [], + "geometry_ids": [], + "primitive_ids": [], + "primitive_uvs": [2], + "primitive_normals": [3], + "points": [3], + "ray_ids": [], + "ray_splits": [], } ans = scene.cast_rays(rays) for k, v in ans.items(): expected_shape = shape + last_dim[k] - assert list( - v.shape - ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format( - expected_shape, list(v.shape), k) + assert (list(v.shape) == expected_shape + ), "shape mismatch: expected {} but got {} for {}".format( + expected_shape, list(v.shape), k) ans = scene.compute_closest_points(query_points) for k, v in ans.items(): expected_shape = shape + last_dim[k] - assert list( - v.shape - ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format( - expected_shape, list(v.shape), k) + assert (list(v.shape) == expected_shape + ), "shape mismatch: expected {} but got {} for {}".format( + expected_shape, list(v.shape), k) ans = scene.list_intersections(rays) nx = np.sum(scene.count_intersections(rays).numpy()).tolist() for k, v in ans.items(): - if k == 'ray_splits': + if k == "ray_splits": alt_shape = [np.prod(rays.shape[:-1]) + 1] else: alt_shape = [nx] - #use np.append otherwise issues if alt_shape = [0] and last_dim[k] = [] + # use np.append otherwise issues if alt_shape = [0] and last_dim[k] = [] expected_shape = np.append(alt_shape, last_dim[k]).tolist() - assert list( - v.shape - ) == expected_shape, 'shape mismatch: expected {} but got {} for {}'.format( - expected_shape, list(v.shape), k) + assert (list(v.shape) == expected_shape + ), "shape mismatch: expected {} but got {} for {}".format( + expected_shape, list(v.shape), k) def test_sphere_wrong_occupancy(): # This test checks a specific scenario where the old implementation # without ray jitter produced wrong results for a sphere because some # rays miss hitting exactly a vertex or an edge. - mesh = o3d.geometry.TriangleMesh.create_sphere(0.8) - mesh = o3d.t.geometry.TriangleMesh.from_legacy(mesh) + mesh = o3d.t.geometry.TriangleMesh.create_sphere(0.8) scene = o3d.t.geometry.RaycastingScene() scene.add_triangles(mesh)