[lang] [refactor] Use preallocated memory via device allocation for Ndarray #3395

Merged · 11 commits · Nov 9, 2021
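
For context: with this change an Ndarray buffer is no longer obtained from a plain Device::allocate_memory call that LlvmProgramImpl has to track and free itself. Instead it is carved out of the LLVM runtime's preallocated memory through a new Device::allocate_memory_runtime path. The chain visible in the diff is the Ndarray constructor → LlvmProgramImpl::allocate_memory_ndarray → Device::allocate_memory_runtime → Device::allocate_llvm_runtime_memory_jit, which invokes the JIT'ed runtime_memory_allocate_aligned and then reads the resulting pointer back through fetch_result_uint64. Below is a minimal standalone C++ sketch of that publish-through-result-buffer pattern; it is not Taichi code, and kRuntimeQuerySlot, the malloc-backed allocator, and the fixed-size buffer are illustrative assumptions.

// Standalone illustration (not Taichi source): a runtime allocation call
// publishes the new pointer through a result buffer, and the device fetches
// it back afterwards. Assumes a 64-bit host.
#include <cstdint>
#include <cstdlib>
#include <iostream>

constexpr int kRuntimeQuerySlot = 0;  // stands in for taichi_result_buffer_runtime_query_id

// Stand-in for the JIT'ed "runtime_memory_allocate_aligned": the allocation
// result is written into the result buffer rather than returned directly.
void runtime_memory_allocate_aligned(std::size_t size, std::uint64_t *result_buffer) {
  void *ptr = std::malloc(size);
  result_buffer[kRuntimeQuerySlot] = reinterpret_cast<std::uintptr_t>(ptr);
}

// Stand-in for CpuDevice::fetch_result_uint64: on the CPU backend this is a
// plain host-side read of the result buffer.
std::uint64_t fetch_result_uint64(int i, std::uint64_t *result_buffer) {
  return result_buffer[i];
}

int main() {
  std::uint64_t result_buffer[8] = {};
  runtime_memory_allocate_aligned(256, result_buffer);  // "allocate" an ndarray buffer
  void *ndarray_ptr = reinterpret_cast<void *>(
      fetch_result_uint64(kRuntimeQuerySlot, result_buffer));
  std::cout << "ndarray buffer at " << ndarray_ptr << "\n";
  std::free(ndarray_ptr);
  return 0;
}
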
22 changes: 22 additions & 0 deletions taichi/backends/cpu/cpu_device.cpp
@@ -26,6 +26,23 @@ DeviceAllocation CpuDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CpuDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime, params.size,
result_buffer);
info.size = params.size;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -50,6 +67,11 @@ DeviceAllocation CpuDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
uint64 ret = result_buffer[i];
return ret;
}

} // namespace cpu
} // namespace lang
} // namespace taichi
6 changes: 6 additions & 0 deletions taichi/backends/cpu/cpu_device.h
@@ -87,12 +87,18 @@ class CpuDevice : public Device {
~CpuDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

29 changes: 29 additions & 0 deletions taichi/backends/cuda/cuda_device.cpp
@@ -31,6 +31,28 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CudaDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
if (params.host_read || params.host_write) {
TI_NOT_IMPLEMENTED
} else {
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime,
params.size, result_buffer);
}
info.size = params.size;
info.is_imported = false;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CudaDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -56,6 +78,13 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
CUDADriver::get_instance().stream_synchronize(nullptr);
uint64 ret;
CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
sizeof(uint64));
return ret;
}
} // namespace cuda
} // namespace lang
} // namespace taichi
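
A note on the two fetch_result_uint64 overrides above: the CPU device can read the result buffer directly from host memory, while on the CUDA backend the result buffer lives in device memory, so the fetch must first synchronize the stream and then copy the value back to the host. The following standalone sketch shows that host-side read using the CUDA runtime API rather than Taichi's CUDADriver wrapper; all names in it are illustrative.

// Why the CUDA backend needs a synchronize + device-to-host copy: the result
// buffer is device memory and cannot be dereferenced from the host.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t *d_result_buffer = nullptr;
  cudaMalloc(reinterpret_cast<void **>(&d_result_buffer), 8 * sizeof(std::uint64_t));
  cudaMemset(d_result_buffer, 0, 8 * sizeof(std::uint64_t));
  // ... a runtime call launched on the device would write the allocation
  // address into slot i here ...
  int i = 0;
  std::uint64_t ret = 0;
  cudaDeviceSynchronize();  // mirrors CUDADriver::stream_synchronize(nullptr)
  cudaMemcpy(&ret, d_result_buffer + i, sizeof(std::uint64_t),
             cudaMemcpyDeviceToHost);  // mirrors memcpy_device_to_host
  std::printf("fetched result: %llu\n", static_cast<unsigned long long>(ret));
  cudaFree(d_result_buffer);
  return 0;
}
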
6 changes: 6 additions & 0 deletions taichi/backends/cuda/cuda_device.h
@@ -87,12 +87,18 @@ class CudaDevice : public Device {
~CudaDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

10 changes: 10 additions & 0 deletions taichi/backends/device.cpp
@@ -127,6 +127,16 @@ void Device::print_all_cap() const {
}
}

uint64_t *Device::allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer) {
runtime_jit->call<void *, std::size_t, std::size_t>(
"runtime_memory_allocate_aligned", runtime, size, taichi_page_size);
return taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
taichi_result_buffer_runtime_query_id, result_buffer));
}

void GraphicsDevice::image_transition(DeviceAllocation img,
ImageLayout old_layout,
ImageLayout new_layout) {
17 changes: 17 additions & 0 deletions taichi/backends/device.h
@@ -1,6 +1,7 @@
#pragma once
#include "taichi/lang_util.h"

#include "taichi/jit/jit_module.h"
#include "taichi/program/compile_config.h"
#include <string>
#include <vector>
@@ -41,6 +42,7 @@ enum class DeviceCapability : uint32_t {
class Device;
struct DeviceAllocation;
struct DevicePtr;
struct LLVMRuntime;

// TODO: Figure out how to support images. Temporary solution is to have all
// opaque types such as images work as an allocation
@@ -386,8 +388,19 @@ class Device {
};

virtual DeviceAllocation allocate_memory(const AllocParams &params) = 0;
virtual DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}
virtual void dealloc_memory(DeviceAllocation handle) = 0;

uint64_t *allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer);

virtual std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") = 0;
@@ -398,6 +411,10 @@
this->allocate_memory(params));
}

virtual uint64 fetch_result_uint64(int i, uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}

// Mapping can fail and will return nullptr
virtual void *map_range(DevicePtr ptr, uint64_t size) = 0;
virtual void *map(DeviceAllocation alloc) = 0;
20 changes: 12 additions & 8 deletions taichi/llvm/llvm_program.cpp
@@ -473,9 +473,6 @@ void LlvmProgramImpl::finalize() {
cuda_device()->dealloc_memory(preallocated_device_buffer_alloc);
}
#endif
for (auto &alloc : ndarray_allocs_) {
get_compute_device()->dealloc_memory(alloc);
}
}

void LlvmProgramImpl::print_memory_profiler_info(
@@ -568,13 +565,20 @@ DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) {
}

DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray(
std::size_t alloc_size) {
std::size_t alloc_size,
uint64 *result_buffer) {
TaichiLLVMContext *tlctx = nullptr;
if (llvm_context_device) {
tlctx = llvm_context_device.get();
} else {
tlctx = llvm_context_host.get();
}

Device::AllocParams device_buffer_alloc_params;
device_buffer_alloc_params.size = alloc_size;
DeviceAllocation alloc =
get_compute_device()->allocate_memory(device_buffer_alloc_params);
ndarray_allocs_.push_back(alloc);
return alloc;
return get_compute_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module, get_llvm_runtime(),
result_buffer);
}

uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
4 changes: 2 additions & 2 deletions taichi/llvm/llvm_program.h
@@ -97,7 +97,8 @@ class LlvmProgramImpl : public ProgramImpl {

void finalize();

DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size);
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);

@@ -158,7 +159,6 @@ class LlvmProgramImpl : public ProgramImpl {
DeviceAllocation preallocated_device_buffer_alloc{kDeviceNullAllocation};

std::unordered_map<int, DeviceAllocation> snode_tree_allocs_;
std::vector<DeviceAllocation> ndarray_allocs_;

std::unique_ptr<Device> device_;
cuda::CudaDevice *cuda_device();
5 changes: 3 additions & 2 deletions taichi/program/ndarray.cpp
@@ -17,8 +17,9 @@ Ndarray::Ndarray(Program *prog,
std::multiplies<>())),
element_size_(data_type_size(dtype)) {
LlvmProgramImpl *prog_impl = prog->get_llvm_program_impl();
ndarray_alloc_ =
prog_impl->allocate_memory_ndarray(nelement_ * element_size_);
ndarray_alloc_ = prog_impl->allocate_memory_ndarray(nelement_ * element_size_,
prog->result_buffer);

data_ptr_ = prog_impl->get_ndarray_alloc_info_ptr(ndarray_alloc_);
}
