[lang] [refactor] Use preallocated memory via device allocation for Ndarray #3395

Merged: 11 commits, Nov 9, 2021
27 changes: 27 additions & 0 deletions taichi/backends/cpu/cpu_device.cpp
@@ -26,6 +26,28 @@ DeviceAllocation CpuDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CpuDevice::allocate_memory_runtime(const AllocParams &params,
                                                    JITModule *runtime_jit,
                                                    LLVMRuntime *runtime,
                                                    uint64 *result_buffer) {
  AllocInfo info;
  runtime_jit->call<void *, std::size_t, std::size_t>(
      "runtime_memory_allocate_aligned", runtime, params.size,
      taichi_page_size);
  info.ptr =
      taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
          taichi_result_buffer_runtime_query_id, result_buffer));

  info.size = params.size;

  DeviceAllocation alloc;
  alloc.alloc_id = allocations_.size();
  alloc.device = this;

  allocations_.push_back(info);
  return alloc;
}

void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -50,6 +72,11 @@ DeviceAllocation CpuDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
  uint64 ret = result_buffer[i];
  return ret;
}

} // namespace cpu
} // namespace lang
} // namespace taichi
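A note on the convention above: runtime_memory_allocate_aligned runs inside the JIT-compiled LLVM runtime and reports the resulting pointer through result_buffer instead of a return value; the host then reads slot taichi_result_buffer_runtime_query_id back via fetch_result_uint64. The following is a minimal self-contained sketch of that handshake, not Taichi's actual runtime; the slot index and the std::aligned_alloc stand-in are illustrative assumptions.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Stand-in for taichi_result_buffer_runtime_query_id (hypothetical slot index).
constexpr int kRuntimeQuerySlot = 0;

// Plays the role of runtime_memory_allocate_aligned: allocate, then report the
// pointer through the result buffer rather than returning it.
void toy_runtime_allocate_aligned(std::uint64_t *result_buffer,
                                  std::size_t size,
                                  std::size_t alignment) {
  // std::aligned_alloc requires size to be a multiple of alignment.
  void *ptr = std::aligned_alloc(alignment, size);
  result_buffer[kRuntimeQuerySlot] = reinterpret_cast<std::uint64_t>(ptr);
}

// Mirrors CpuDevice::fetch_result_uint64: on the CPU this is a plain host read.
std::uint64_t toy_fetch_result_uint64(int i, std::uint64_t *result_buffer) {
  return result_buffer[i];
}

int main() {
  std::uint64_t result_buffer[8] = {};
  toy_runtime_allocate_aligned(result_buffer, /*size=*/4096, /*alignment=*/4096);
  void *ptr = reinterpret_cast<void *>(
      toy_fetch_result_uint64(kRuntimeQuerySlot, result_buffer));
  std::printf("allocated at %p\n", ptr);
  std::free(ptr);
  return 0;
}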
7 changes: 7 additions & 0 deletions taichi/backends/cpu/cpu_device.h
@@ -6,6 +6,8 @@

#include "taichi/common/core.h"
#include "taichi/backends/device.h"
#include "taichi/jit/jit_session.h"
#include "taichi/llvm/llvm_program.h"
#include "taichi/system/virtual_memory.h"

namespace taichi {
@@ -88,6 +90,10 @@ class CpuDevice : public Device {

DeviceAllocation allocate_memory(const AllocParams &params) override;
void dealloc_memory(DeviceAllocation handle) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
                                         JITModule *runtime_jit_module,
                                         LLVMRuntime *runtime,
                                         uint64 *result_buffer);

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
@@ -116,6 +122,7 @@
TI_ERROR("invalid DeviceAllocation");
}
}
uint64 fetch_result_uint64(int i, uint64 *result_buffer);
};

} // namespace cpu
34 changes: 34 additions & 0 deletions taichi/backends/cuda/cuda_device.cpp
@@ -31,6 +31,33 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CudaDevice::allocate_memory_runtime(const AllocParams &params,
                                                      JITModule *runtime_jit,
                                                      LLVMRuntime *runtime,
                                                      uint64 *result_buffer) {
  AllocInfo info;
  if (params.host_read || params.host_write) {
    TI_NOT_IMPLEMENTED
  } else {
    runtime_jit->call<void *, std::size_t, std::size_t>(
        "runtime_memory_allocate_aligned", runtime, params.size,
        taichi_page_size);
    info.ptr =
        taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
            taichi_result_buffer_runtime_query_id, result_buffer));
  }

  info.size = params.size;
  info.is_imported = false;

  DeviceAllocation alloc;
  alloc.alloc_id = allocations_.size();
  alloc.device = this;

  allocations_.push_back(info);
  return alloc;
}

void CudaDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -56,6 +83,13 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
  CUDADriver::get_instance().stream_synchronize(nullptr);
  uint64 ret;
  CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                   sizeof(uint64));
  return ret;
}
} // namespace cuda
} // namespace lang
} // namespace taichi
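One difference from the CPU path is worth calling out: on CUDA the result buffer lives in device memory, so the host must synchronize and copy the slot back before reading the allocated pointer, which is what the stream_synchronize plus memcpy_device_to_host pair above does. Below is a hedged sketch of the same fetch written against the plain CUDA runtime API; Taichi itself goes through its CUDADriver wrapper, so the function name and setup here are assumptions for illustration.

#include <cuda_runtime.h>
#include <cstdint>

// Assumes device_result_buffer was allocated in device memory and that the
// runtime call which fills slot i has already been launched.
std::uint64_t fetch_result_uint64_sketch(int i,
                                         std::uint64_t *device_result_buffer) {
  std::uint64_t ret = 0;
  cudaDeviceSynchronize();  // wait for the runtime call that wrote the slot
  cudaMemcpy(&ret, device_result_buffer + i, sizeof(ret),
             cudaMemcpyDeviceToHost);
  return ret;
}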
6 changes: 6 additions & 0 deletions taichi/backends/cuda/cuda_device.h
@@ -5,6 +5,7 @@
#include "taichi/common/core.h"
#include "taichi/backends/cuda/cuda_driver.h"
#include "taichi/backends/cuda/cuda_context.h"
#include "taichi/backends/cuda/jit_cuda.h"
#include "taichi/backends/device.h"

namespace taichi {
@@ -88,6 +89,10 @@ class CudaDevice : public Device {

DeviceAllocation allocate_memory(const AllocParams &params) override;
void dealloc_memory(DeviceAllocation handle) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
                                         JITModule *runtime_jit_module,
                                         LLVMRuntime *runtime,
                                         uint64 *result_buffer);

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
@@ -113,6 +118,7 @@
TI_ERROR("invalid DeviceAllocation");
}
}
uint64 fetch_result_uint64(int i, uint64 *result_buffer);
};

} // namespace cuda
30 changes: 22 additions & 8 deletions taichi/llvm/llvm_program.cpp
@@ -473,9 +473,6 @@ void LlvmProgramImpl::finalize() {
cuda_device()->dealloc_memory(preallocated_device_buffer_alloc);
}
#endif
for (auto &alloc : ndarray_allocs_) {
get_compute_device()->dealloc_memory(alloc);
}
}

void LlvmProgramImpl::print_memory_profiler_info(
@@ -568,13 +565,30 @@ DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) {
}

DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray(
std::size_t alloc_size) {
std::size_t alloc_size,
uint64 *result_buffer) {
TaichiLLVMContext *tlctx = nullptr;
if (llvm_context_device) {
tlctx = llvm_context_device.get();
} else {
tlctx = llvm_context_host.get();
}

Device::AllocParams device_buffer_alloc_params;
device_buffer_alloc_params.size = alloc_size;
DeviceAllocation alloc =
get_compute_device()->allocate_memory(device_buffer_alloc_params);
ndarray_allocs_.push_back(alloc);
return alloc;
if (config->arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
return cuda_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module,
get_llvm_runtime(), result_buffer);
#else
TI_NOT_IMPLEMENTED
#endif
} else {
return cpu_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module,
get_llvm_runtime(), result_buffer);
}
}

uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
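For context on the PR title: allocate_memory_ndarray no longer calls Device::allocate_memory and tracks the result in ndarray_allocs_; it now dispatches to allocate_memory_runtime, so the allocation is carried out by the JIT runtime itself (on CUDA, out of the runtime's preallocated device buffer). Because the memory belongs to the runtime rather than to individual ndarrays, finalize() no longer needs the per-ndarray dealloc loop removed above. Below is a toy model of such a preallocated pool, assuming a simple bump-pointer scheme and ignoring deallocation and thread safety; Taichi's real runtime allocator is more involved.

#include <cstddef>
#include <cstdint>
#include <vector>

// One big preallocated buffer plus a bump pointer; chunks are handed out
// aligned (e.g. to a page size) and are never freed individually.
struct PreallocatedPool {
  std::vector<std::uint8_t> buffer;
  std::size_t head = 0;

  explicit PreallocatedPool(std::size_t bytes) : buffer(bytes) {}

  // Carve an aligned chunk out of the buffer; returns nullptr when exhausted.
  void *allocate_aligned(std::size_t size, std::size_t alignment) {
    std::uintptr_t base = reinterpret_cast<std::uintptr_t>(buffer.data());
    std::uintptr_t aligned =
        (base + head + alignment - 1) / alignment * alignment;
    if (aligned + size > base + buffer.size())
      return nullptr;
    head = aligned + size - base;
    return reinterpret_cast<void *>(aligned);
  }
};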
4 changes: 2 additions & 2 deletions taichi/llvm/llvm_program.h
@@ -97,7 +97,8 @@ class LlvmProgramImpl : public ProgramImpl {

void finalize();

DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size);
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);

@@ -158,7 +159,6 @@
DeviceAllocation preallocated_device_buffer_alloc{kDeviceNullAllocation};

std::unordered_map<int, DeviceAllocation> snode_tree_allocs_;
std::vector<DeviceAllocation> ndarray_allocs_;

std::unique_ptr<Device> device_;
cuda::CudaDevice *cuda_device();
5 changes: 3 additions & 2 deletions taichi/program/ndarray.cpp
@@ -17,8 +17,9 @@ Ndarray::Ndarray(Program *prog,
std::multiplies<>())),
element_size_(data_type_size(dtype)) {
LlvmProgramImpl *prog_impl = prog->get_llvm_program_impl();
ndarray_alloc_ =
prog_impl->allocate_memory_ndarray(nelement_ * element_size_);
ndarray_alloc_ = prog_impl->allocate_memory_ndarray(nelement_ * element_size_,
prog->result_buffer);

data_ptr_ = prog_impl->get_ndarray_alloc_info_ptr(ndarray_alloc_);
}
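The constructor change above is the last link in the chain: the Ndarray now threads the program's result_buffer down through allocate_memory_ndarray, so the pointer produced by the runtime-side allocation can be read back and cached as data_ptr_ via get_ndarray_alloc_info_ptr.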
