[lang] [refactor] Use preallocated memory via device allocation for Ndarray (#3395)

* Use preallocated memory from runtime

* Auto Format

* Add cuda guard

* Unify the memory runtime function

* Auto Format

* Add a helper for cpu and cuda mem alloc

* Auto Format

Co-authored-by: Taichi Gardener <taichigardener@gmail.com>
qiao-bo and taichi-gardener authored Nov 9, 2021
1 parent ceb61f3 commit c7c97e2
Showing 9 changed files with 107 additions and 12 deletions.
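At a glance: Ndarray buffers are now carved out of the LLVM runtime's preallocated memory pool instead of going through a raw device malloc that had to be freed manually. A minimal sketch of the resulting call chain, assuming an already-materialized LLVM-backed Program (the constructor signature follows taichi/program/ndarray.cpp at this commit; the dtype and shape values are illustrative):

// Sketch only: `program` is an assumed, already-materialized Program.
Ndarray arr(&program, PrimitiveType::f32, /*shape=*/{128, 128});
// Under the hood:
//   Ndarray::Ndarray()                         (taichi/program/ndarray.cpp)
//   -> LlvmProgramImpl::allocate_memory_ndarray(size, prog->result_buffer)
//   -> Device::allocate_memory_runtime(params, runtime_jit, runtime, ...)
//   -> JIT call to "runtime_memory_allocate_aligned" inside the runtime
//   -> pointer read back on the host via Device::fetch_result_uint64()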
22 changes: 22 additions & 0 deletions taichi/backends/cpu/cpu_device.cpp
@@ -26,6 +26,23 @@ DeviceAllocation CpuDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CpuDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime, params.size,
result_buffer);
info.size = params.size;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -50,6 +67,11 @@ DeviceAllocation CpuDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
uint64 ret = result_buffer[i];
return ret;
}

} // namespace cpu
} // namespace lang
} // namespace taichi
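The CPU override of fetch_result_uint64 is a plain array load: the result buffer is ordinary host memory, so no synchronization or copy is needed. A standalone sketch of the same idea in plain C++ (the function name and slot parameter are illustrative, standing in for taichi_result_buffer_runtime_query_id):

#include <cstdint>

// Host-visible result buffer: a direct load suffices on the CPU backend.
uint64_t fetch_result_u64_cpu(int slot, const uint64_t *result_buffer) {
  return result_buffer[slot];
}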
6 changes: 6 additions & 0 deletions taichi/backends/cpu/cpu_device.h
@@ -87,12 +87,18 @@ class CpuDevice : public Device {
~CpuDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

29 changes: 29 additions & 0 deletions taichi/backends/cuda/cuda_device.cpp
@@ -31,6 +31,28 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CudaDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
if (params.host_read || params.host_write) {
TI_NOT_IMPLEMENTED
} else {
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime,
params.size, result_buffer);
}
info.size = params.size;
info.is_imported = false;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CudaDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -56,6 +78,13 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
CUDADriver::get_instance().stream_synchronize(nullptr);
uint64 ret;
CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
sizeof(uint64));
return ret;
}
} // namespace cuda
} // namespace lang
} // namespace taichi
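On CUDA the result buffer lives in device memory, so the same query needs a stream synchronize (to let any in-flight kernel finish writing the slot) followed by a device-to-host copy. A hedged standalone equivalent against the raw CUDA driver API — the commit itself goes through Taichi's CUDADriver wrapper, and error checking is elided here:

#include <cuda.h>
#include <cstdint>

uint64_t fetch_result_u64_cuda(int slot, CUdeviceptr result_buffer) {
  cuStreamSynchronize(/*hStream=*/nullptr);  // drain pending writes
  uint64_t ret = 0;
  // Copy one 64-bit slot from device memory back to the host.
  cuMemcpyDtoH(&ret, result_buffer + slot * sizeof(uint64_t),
               sizeof(uint64_t));
  return ret;
}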
6 changes: 6 additions & 0 deletions taichi/backends/cuda/cuda_device.h
@@ -87,12 +87,18 @@ class CudaDevice : public Device {
~CudaDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

10 changes: 10 additions & 0 deletions taichi/backends/device.cpp
@@ -127,6 +127,16 @@ void Device::print_all_cap() const {
}
}

uint64_t *Device::allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer) {
runtime_jit->call<void *, std::size_t, std::size_t>(
"runtime_memory_allocate_aligned", runtime, size, taichi_page_size);
return taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
taichi_result_buffer_runtime_query_id, result_buffer));
}

void GraphicsDevice::image_transition(DeviceAllocation img,
ImageLayout old_layout,
ImageLayout new_layout) {
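Note that the JIT call above returns nothing directly: the runtime-side allocator is expected to stash the resulting pointer in the result buffer at taichi_result_buffer_runtime_query_id, which fetch_result_uint64 then reads back on the host. A rough sketch of what that runtime-side counterpart is assumed to look like (compare taichi/runtime/llvm/runtime.cpp; the exact body may differ):

// Assumed runtime-side counterpart (illustrative, not part of this diff):
// the allocation "returns" through the result buffer, not the call itself.
void runtime_memory_allocate_aligned(LLVMRuntime *runtime,
                                     std::size_t size,
                                     std::size_t alignment) {
  runtime->set_result(taichi_result_buffer_runtime_query_id,
                      runtime->allocate_aligned(size, alignment));
}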
17 changes: 17 additions & 0 deletions taichi/backends/device.h
@@ -1,6 +1,7 @@
#pragma once
#include "taichi/lang_util.h"

#include "taichi/jit/jit_module.h"
#include "taichi/program/compile_config.h"
#include <string>
#include <vector>
@@ -41,6 +42,7 @@ enum class DeviceCapability : uint32_t {
class Device;
struct DeviceAllocation;
struct DevicePtr;
struct LLVMRuntime;

// TODO: Figure out how to support images. A temporary solution is to have all
// opaque types such as images work as an allocation
@@ -386,8 +388,19 @@ class Device {
};

virtual DeviceAllocation allocate_memory(const AllocParams &params) = 0;
virtual DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}
virtual void dealloc_memory(DeviceAllocation handle) = 0;

uint64_t *allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer);

virtual std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") = 0;
@@ -398,6 +411,10 @@
this->allocate_memory(params));
}

virtual uint64 fetch_result_uint64(int i, uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}

// Mapping can fail and will return nullptr
virtual void *map_range(DevicePtr ptr, uint64_t size) = 0;
virtual void *map(DeviceAllocation alloc) = 0;
20 changes: 12 additions & 8 deletions taichi/llvm/llvm_program.cpp
@@ -473,9 +473,6 @@ void LlvmProgramImpl::finalize() {
cuda_device()->dealloc_memory(preallocated_device_buffer_alloc);
}
#endif
for (auto &alloc : ndarray_allocs_) {
get_compute_device()->dealloc_memory(alloc);
}
}

void LlvmProgramImpl::print_memory_profiler_info(
@@ -568,13 +565,20 @@ DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) {
}

DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray(
std::size_t alloc_size) {
std::size_t alloc_size,
uint64 *result_buffer) {
TaichiLLVMContext *tlctx = nullptr;
if (llvm_context_device) {
tlctx = llvm_context_device.get();
} else {
tlctx = llvm_context_host.get();
}

Device::AllocParams device_buffer_alloc_params;
device_buffer_alloc_params.size = alloc_size;
DeviceAllocation alloc =
get_compute_device()->allocate_memory(device_buffer_alloc_params);
ndarray_allocs_.push_back(alloc);
return alloc;
return get_compute_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module, get_llvm_runtime(),
result_buffer);
}

uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
4 changes: 2 additions & 2 deletions taichi/llvm/llvm_program.h
@@ -97,7 +97,8 @@ class LlvmProgramImpl : public ProgramImpl {

void finalize();

DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size);
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);

@@ -158,7 +159,6 @@ class LlvmProgramImpl : public ProgramImpl {
DeviceAllocation preallocated_device_buffer_alloc{kDeviceNullAllocation};

std::unordered_map<int, DeviceAllocation> snode_tree_allocs_;
std::vector<DeviceAllocation> ndarray_allocs_;

std::unique_ptr<Device> device_;
cuda::CudaDevice *cuda_device();
5 changes: 3 additions & 2 deletions taichi/program/ndarray.cpp
@@ -17,8 +17,9 @@ Ndarray::Ndarray(Program *prog,
std::multiplies<>())),
element_size_(data_type_size(dtype)) {
LlvmProgramImpl *prog_impl = prog->get_llvm_program_impl();
ndarray_alloc_ =
prog_impl->allocate_memory_ndarray(nelement_ * element_size_);
ndarray_alloc_ = prog_impl->allocate_memory_ndarray(nelement_ * element_size_,
prog->result_buffer);

data_ptr_ = prog_impl->get_ndarray_alloc_info_ptr(ndarray_alloc_);
}

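Taken together with the llvm_program.cpp change above, the Ndarray's backing store is now owned by the runtime pool, which is why the manual dealloc loop over ndarray_allocs_ could be dropped. A hedged sketch composing the pieces this commit touches, using only APIs visible in the diffs above (`prog` is assumed non-null and materialized):

// Illustrative composition of the changed pieces, not code from the commit.
DeviceAllocation make_ndarray_storage(Program *prog,
                                      std::size_t nelement,
                                      std::size_t element_size) {
  LlvmProgramImpl *impl = prog->get_llvm_program_impl();
  // Serviced from the preallocated pool via the JIT'ed runtime allocator;
  // the returned handle indexes the device's allocations_ table.
  return impl->allocate_memory_ndarray(nelement * element_size,
                                       prog->result_buffer);
}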
