[lang] [refactor] Use preallocated memory via device allocation for Ndarray #3395

Merged: 11 commits, Nov 9, 2021
27 changes: 27 additions & 0 deletions taichi/backends/cpu/cpu_device.cpp
@@ -26,6 +26,28 @@ DeviceAllocation CpuDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CpuDevice::allocate_memory_runtime(const AllocParams &params,
                                                    JITModule *runtime_jit,
                                                    LLVMRuntime *runtime,
                                                    uint64 *result_buffer) {
  AllocInfo info;
  runtime_jit->call<void *, std::size_t, std::size_t>(
      "runtime_memory_allocate_aligned", runtime, params.size,
      taichi_page_size);
  info.ptr =
      taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
          taichi_result_buffer_runtime_query_id, result_buffer));

  info.size = params.size;

  DeviceAllocation alloc;
  alloc.alloc_id = allocations_.size();
  alloc.device = this;

  allocations_.push_back(info);
  return alloc;
}

void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -50,6 +72,11 @@ DeviceAllocation CpuDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
  uint64 ret = result_buffer[i];
  return ret;
}

} // namespace cpu
} // namespace lang
} // namespace taichi
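A note on the convention above: runtime_memory_allocate_aligned runs inside the JIT-compiled LLVM runtime and reports the resulting pointer through result_buffer instead of a return value; the host then reads slot taichi_result_buffer_runtime_query_id back via fetch_result_uint64. The following is a minimal self-contained sketch of that handshake, not Taichi's actual runtime; the slot index and the std::aligned_alloc stand-in are illustrative assumptions.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Stand-in for taichi_result_buffer_runtime_query_id (hypothetical slot index).
constexpr int kRuntimeQuerySlot = 0;

// Plays the role of runtime_memory_allocate_aligned: allocate, then report the
// pointer through the result buffer rather than returning it.
void toy_runtime_allocate_aligned(std::uint64_t *result_buffer,
                                  std::size_t size,
                                  std::size_t alignment) {
  // std::aligned_alloc requires size to be a multiple of alignment.
  void *ptr = std::aligned_alloc(alignment, size);
  result_buffer[kRuntimeQuerySlot] = reinterpret_cast<std::uint64_t>(ptr);
}

// Mirrors CpuDevice::fetch_result_uint64: on the CPU this is a plain host read.
std::uint64_t toy_fetch_result_uint64(int i, std::uint64_t *result_buffer) {
  return result_buffer[i];
}

int main() {
  std::uint64_t result_buffer[8] = {};
  toy_runtime_allocate_aligned(result_buffer, /*size=*/4096, /*alignment=*/4096);
  void *ptr = reinterpret_cast<void *>(
      toy_fetch_result_uint64(kRuntimeQuerySlot, result_buffer));
  std::printf("allocated at %p\n", ptr);
  std::free(ptr);
  return 0;
}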
7 changes: 7 additions & 0 deletions taichi/backends/cpu/cpu_device.h
@@ -6,6 +6,8 @@

#include "taichi/common/core.h"
#include "taichi/backends/device.h"
#include "taichi/jit/jit_session.h"
#include "taichi/llvm/llvm_program.h"
#include "taichi/system/virtual_memory.h"

namespace taichi {
@@ -88,6 +90,10 @@ class CpuDevice : public Device {

DeviceAllocation allocate_memory(const AllocParams &params) override;
void dealloc_memory(DeviceAllocation handle) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
                                         JITModule *runtime_jit_module,
                                         LLVMRuntime *runtime,
                                         uint64 *result_buffer);

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
@@ -116,6 +122,7 @@
TI_ERROR("invalid DeviceAllocation");
}
}
uint64 fetch_result_uint64(int i, uint64 *result_buffer);
};

} // namespace cpu
34 changes: 34 additions & 0 deletions taichi/backends/cuda/cuda_device.cpp
@@ -31,6 +31,33 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CudaDevice::allocate_memory_runtime(const AllocParams &params,
                                                      JITModule *runtime_jit,
                                                      LLVMRuntime *runtime,
                                                      uint64 *result_buffer) {
  AllocInfo info;
  if (params.host_read || params.host_write) {
    TI_NOT_IMPLEMENTED
  } else {
    runtime_jit->call<void *, std::size_t, std::size_t>(
        "runtime_memory_allocate_aligned", runtime, params.size,
        taichi_page_size);
    info.ptr =
        taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
            taichi_result_buffer_runtime_query_id, result_buffer));
  }

  info.size = params.size;
  info.is_imported = false;

  DeviceAllocation alloc;
  alloc.alloc_id = allocations_.size();
  alloc.device = this;

  allocations_.push_back(info);
  return alloc;
}

void CudaDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -56,6 +83,13 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
  CUDADriver::get_instance().stream_synchronize(nullptr);
  uint64 ret;
  CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                   sizeof(uint64));
  return ret;
}
} // namespace cuda
} // namespace lang
} // namespace taichi
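One difference from the CPU path is worth calling out: on CUDA the result buffer lives in device memory, so the host must synchronize and copy the slot back before reading the allocated pointer, which is what the stream_synchronize plus memcpy_device_to_host pair above does. Below is a hedged sketch of the same fetch written against the plain CUDA runtime API; Taichi itself goes through its CUDADriver wrapper, so the function name and setup here are assumptions for illustration.

#include <cuda_runtime.h>
#include <cstdint>

// Assumes device_result_buffer was allocated in device memory and that the
// runtime call which fills slot i has already been launched.
std::uint64_t fetch_result_uint64_sketch(int i,
                                         std::uint64_t *device_result_buffer) {
  std::uint64_t ret = 0;
  cudaDeviceSynchronize();  // wait for the runtime call that wrote the slot
  cudaMemcpy(&ret, device_result_buffer + i, sizeof(ret),
             cudaMemcpyDeviceToHost);
  return ret;
}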
6 changes: 6 additions & 0 deletions taichi/backends/cuda/cuda_device.h
@@ -5,6 +5,7 @@
#include "taichi/common/core.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/jit_cuda.h"
#include "taichi/backends/device.h"

namespace taichi {
@@ -88,6 +89,10 @@ class CudaDevice : public Device {

DeviceAllocation allocate_memory(const AllocParams &params) override;
void dealloc_memory(DeviceAllocation handle) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
                                         JITModule *runtime_jit_module,
                                         LLVMRuntime *runtime,
                                         uint64 *result_buffer);

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
@@ -113,6 +118,7 @@
TI_ERROR("invalid DeviceAllocation");
}
}
uint64 fetch_result_uint64(int i, uint64 *result_buffer);
};

} // namespace cuda
30 changes: 22 additions & 8 deletions taichi/llvm/llvm_program.cpp
@@ -473,9 +473,6 @@ void LlvmProgramImpl::finalize() {
cuda_device()->dealloc_memory(preallocated_device_buffer_alloc);
}
#endif
for (auto &alloc : ndarray_allocs_) {
get_compute_device()->dealloc_memory(alloc);
}
}

void LlvmProgramImpl::print_memory_profiler_info(
@@ -568,13 +565,30 @@ DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) {
}

DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray(
std::size_t alloc_size) {
std::size_t alloc_size,
uint64 *result_buffer) {
TaichiLLVMContext *tlctx = nullptr;
if (llvm_context_device) {
tlctx = llvm_context_device.get();
} else {
tlctx = llvm_context_host.get();
}

Device::AllocParams device_buffer_alloc_params;
device_buffer_alloc_params.size = alloc_size;
DeviceAllocation alloc =
get_compute_device()->allocate_memory(device_buffer_alloc_params);
ndarray_allocs_.push_back(alloc);
return alloc;
if (config->arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
return cuda_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module,
get_llvm_runtime(), result_buffer);
#else
TI_NOT_IMPLEMENTED
#endif
} else {
return cpu_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module,
get_llvm_runtime(), result_buffer);
}
}

uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
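For context on the PR title: allocate_memory_ndarray no longer calls Device::allocate_memory and tracks the result in ndarray_allocs_; it now dispatches to allocate_memory_runtime, so the allocation is carried out by the JIT runtime itself (on CUDA, out of the runtime's preallocated device buffer). Because the memory belongs to the runtime rather than to individual ndarrays, finalize() no longer needs the per-ndarray dealloc loop removed above. Below is a toy model of such a preallocated pool, assuming a simple bump-pointer scheme and ignoring deallocation and thread safety; Taichi's real runtime allocator is more involved.

#include <cstddef>
#include <cstdint>
#include <vector>

// One big preallocated buffer plus a bump pointer; chunks are handed out
// aligned (e.g. to a page size) and are never freed individually.
struct PreallocatedPool {
  std::vector<std::uint8_t> buffer;
  std::size_t head = 0;

  explicit PreallocatedPool(std::size_t bytes) : buffer(bytes) {}

  // Carve an aligned chunk out of the buffer; returns nullptr when exhausted.
  void *allocate_aligned(std::size_t size, std::size_t alignment) {
    std::uintptr_t base = reinterpret_cast<std::uintptr_t>(buffer.data());
    std::uintptr_t aligned =
        (base + head + alignment - 1) / alignment * alignment;
    if (aligned + size > base + buffer.size())
      return nullptr;
    head = aligned + size - base;
    return reinterpret_cast<void *>(aligned);
  }
};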
4 changes: 2 additions & 2 deletions taichi/llvm/llvm_program.h
@@ -97,7 +97,8 @@ class LlvmProgramImpl : public ProgramImpl {

void finalize();

DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size);
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);

@@ -158,7 +159,6 @@
DeviceAllocation preallocated_device_buffer_alloc{kDeviceNullAllocation};

std::unordered_map<int, DeviceAllocation> snode_tree_allocs_;
std::vector<DeviceAllocation> ndarray_allocs_;

std::unique_ptr<Device> device_;
cuda::CudaDevice *cuda_device();
5 changes: 3 additions & 2 deletions taichi/program/ndarray.cpp
@@ -17,8 +17,9 @@ Ndarray::Ndarray(Program *prog,
std::multiplies<>())),
element_size_(data_type_size(dtype)) {
LlvmProgramImpl *prog_impl = prog->get_llvm_program_impl();
ndarray_alloc_ =
prog_impl->allocate_memory_ndarray(nelement_ * element_size_);
ndarray_alloc_ = prog_impl->allocate_memory_ndarray(nelement_ * element_size_,
prog->result_buffer);

data_ptr_ = prog_impl->get_ndarray_alloc_info_ptr(ndarray_alloc_);
}
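The constructor change above is the last link in the chain: the Ndarray now threads the program's result_buffer down through allocate_memory_ndarray, so the pointer produced by the runtime-side allocation can be read back and cached as data_ptr_ via get_ndarray_alloc_info_ptr.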
