[lang] [refactor] Use preallocated memory via device allocation for Ndarray (#3395)

* Use preallocated memory from runtime

* Auto Format

* Add cuda guard

* Unify the memory runtime function

* Auto Format

* Add a helper for cpu and cuda mem alloc

* Auto Format

Co-authored-by: Taichi Gardener <taichigardener@gmail.com>
qiao-bo and taichi-gardener authored Nov 9, 2021
1 parent ceb61f3 commit c7c97e2
Showing 9 changed files with 107 additions and 12 deletions.
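At a glance: Ndarray buffers are now carved out of the LLVM runtime's preallocated memory pool instead of going through a raw device malloc that had to be freed manually. A minimal sketch of the resulting call chain, assuming an already-materialized LLVM-backed Program (the constructor signature follows taichi/program/ndarray.cpp at this commit; the dtype and shape values are illustrative):

// Sketch only: `program` is an assumed, already-materialized Program.
Ndarray arr(&program, PrimitiveType::f32, /*shape=*/{128, 128});
// Under the hood:
//   Ndarray::Ndarray()                         (taichi/program/ndarray.cpp)
//   -> LlvmProgramImpl::allocate_memory_ndarray(size, prog->result_buffer)
//   -> Device::allocate_memory_runtime(params, runtime_jit, runtime, ...)
//   -> JIT call to "runtime_memory_allocate_aligned" inside the runtime
//   -> pointer read back on the host via Device::fetch_result_uint64()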
22 changes: 22 additions & 0 deletions taichi/backends/cpu/cpu_device.cpp
@@ -26,6 +26,23 @@ DeviceAllocation CpuDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CpuDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime, params.size,
result_buffer);
info.size = params.size;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CpuDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -50,6 +67,11 @@ DeviceAllocation CpuDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CpuDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
uint64 ret = result_buffer[i];
return ret;
}

} // namespace cpu
} // namespace lang
} // namespace taichi
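The CPU override of fetch_result_uint64 is a plain array load: the result buffer is ordinary host memory, so no synchronization or copy is needed. A standalone sketch of the same idea in plain C++ (the function name and slot parameter are illustrative, standing in for taichi_result_buffer_runtime_query_id):

#include <cstdint>

// Host-visible result buffer: a direct load suffices on the CPU backend.
uint64_t fetch_result_u64_cpu(int slot, const uint64_t *result_buffer) {
  return result_buffer[slot];
}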
6 changes: 6 additions & 0 deletions taichi/backends/cpu/cpu_device.h
@@ -87,12 +87,18 @@ class CpuDevice : public Device {
~CpuDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

29 changes: 29 additions & 0 deletions taichi/backends/cuda/cuda_device.cpp
@@ -31,6 +31,28 @@ DeviceAllocation CudaDevice::allocate_memory(const AllocParams &params) {
return alloc;
}

DeviceAllocation CudaDevice::allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
AllocInfo info;
if (params.host_read || params.host_write) {
TI_NOT_IMPLEMENTED
} else {
info.ptr = allocate_llvm_runtime_memory_jit(runtime_jit, runtime,
params.size, result_buffer);
}
info.size = params.size;
info.is_imported = false;

DeviceAllocation alloc;
alloc.alloc_id = allocations_.size();
alloc.device = this;

allocations_.push_back(info);
return alloc;
}

void CudaDevice::dealloc_memory(DeviceAllocation handle) {
validate_device_alloc(handle);
AllocInfo &info = allocations_[handle.alloc_id];
@@ -56,6 +78,13 @@ DeviceAllocation CudaDevice::import_memory(void *ptr, size_t size) {
return alloc;
}

uint64 CudaDevice::fetch_result_uint64(int i, uint64 *result_buffer) {
CUDADriver::get_instance().stream_synchronize(nullptr);
uint64 ret;
CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
sizeof(uint64));
return ret;
}
} // namespace cuda
} // namespace lang
} // namespace taichi
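On CUDA the result buffer lives in device memory, so the same query needs a stream synchronize (to let any in-flight kernel finish writing the slot) followed by a device-to-host copy. A hedged standalone equivalent against the raw CUDA driver API — the commit itself goes through Taichi's CUDADriver wrapper, and error checking is elided here:

#include <cuda.h>
#include <cstdint>

uint64_t fetch_result_u64_cuda(int slot, CUdeviceptr result_buffer) {
  cuStreamSynchronize(/*hStream=*/nullptr);  // drain pending writes
  uint64_t ret = 0;
  // Copy one 64-bit slot from device memory back to the host.
  cuMemcpyDtoH(&ret, result_buffer + slot * sizeof(uint64_t),
               sizeof(uint64_t));
  return ret;
}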
6 changes: 6 additions & 0 deletions taichi/backends/cuda/cuda_device.h
@@ -87,12 +87,18 @@ class CudaDevice : public Device {
~CudaDevice() override{};

DeviceAllocation allocate_memory(const AllocParams &params) override;
DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit_module,
LLVMRuntime *runtime,
uint64 *result_buffer) override;
void dealloc_memory(DeviceAllocation handle) override;

std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") override{TI_NOT_IMPLEMENTED};

uint64 fetch_result_uint64(int i, uint64 *result_buffer) override;

void *map_range(DevicePtr ptr, uint64_t size) override{TI_NOT_IMPLEMENTED};
void *map(DeviceAllocation alloc) override{TI_NOT_IMPLEMENTED};

10 changes: 10 additions & 0 deletions taichi/backends/device.cpp
@@ -127,6 +127,16 @@ void Device::print_all_cap() const {
}
}

uint64_t *Device::allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer) {
runtime_jit->call<void *, std::size_t, std::size_t>(
"runtime_memory_allocate_aligned", runtime, size, taichi_page_size);
return taichi_union_cast_with_different_sizes<uint64_t *>(fetch_result_uint64(
taichi_result_buffer_runtime_query_id, result_buffer));
}

void GraphicsDevice::image_transition(DeviceAllocation img,
ImageLayout old_layout,
ImageLayout new_layout) {
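Note that the JIT call above returns nothing directly: the runtime-side allocator is expected to stash the resulting pointer in the result buffer at taichi_result_buffer_runtime_query_id, which fetch_result_uint64 then reads back on the host. A rough sketch of what that runtime-side counterpart is assumed to look like (compare taichi/runtime/llvm/runtime.cpp; the exact body may differ):

// Assumed runtime-side counterpart (illustrative, not part of this diff):
// the allocation "returns" through the result buffer, not the call itself.
void runtime_memory_allocate_aligned(LLVMRuntime *runtime,
                                     std::size_t size,
                                     std::size_t alignment) {
  runtime->set_result(taichi_result_buffer_runtime_query_id,
                      runtime->allocate_aligned(size, alignment));
}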
17 changes: 17 additions & 0 deletions taichi/backends/device.h
@@ -1,6 +1,7 @@
#pragma once
#include "taichi/lang_util.h"

#include "taichi/jit/jit_module.h"
#include "taichi/program/compile_config.h"
#include <string>
#include <vector>
@@ -41,6 +42,7 @@ enum class DeviceCapability : uint32_t {
class Device;
struct DeviceAllocation;
struct DevicePtr;
struct LLVMRuntime;

// TODO: Figure out how to support images. A temporary solution is to have all
// opaque types such as images work as an allocation
@@ -386,8 +388,19 @@ class Device {
};

virtual DeviceAllocation allocate_memory(const AllocParams &params) = 0;
virtual DeviceAllocation allocate_memory_runtime(const AllocParams &params,
JITModule *runtime_jit,
LLVMRuntime *runtime,
uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}
virtual void dealloc_memory(DeviceAllocation handle) = 0;

uint64_t *allocate_llvm_runtime_memory_jit(JITModule *runtime_jit,
LLVMRuntime *runtime,
size_t size,
uint64 *result_buffer);

virtual std::unique_ptr<Pipeline> create_pipeline(
const PipelineSourceDesc &src,
std::string name = "Pipeline") = 0;
@@ -398,6 +411,10 @@
this->allocate_memory(params));
}

virtual uint64 fetch_result_uint64(int i, uint64 *result_buffer) {
TI_NOT_IMPLEMENTED
}

// Mapping can fail and will return nullptr
virtual void *map_range(DevicePtr ptr, uint64_t size) = 0;
virtual void *map(DeviceAllocation alloc) = 0;
20 changes: 12 additions & 8 deletions taichi/llvm/llvm_program.cpp
@@ -473,9 +473,6 @@ void LlvmProgramImpl::finalize() {
cuda_device()->dealloc_memory(preallocated_device_buffer_alloc);
}
#endif
for (auto &alloc : ndarray_allocs_) {
get_compute_device()->dealloc_memory(alloc);
}
}

void LlvmProgramImpl::print_memory_profiler_info(
@@ -568,13 +565,20 @@ DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) {
}

DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray(
std::size_t alloc_size) {
std::size_t alloc_size,
uint64 *result_buffer) {
TaichiLLVMContext *tlctx = nullptr;
if (llvm_context_device) {
tlctx = llvm_context_device.get();
} else {
tlctx = llvm_context_host.get();
}

Device::AllocParams device_buffer_alloc_params;
device_buffer_alloc_params.size = alloc_size;
DeviceAllocation alloc =
get_compute_device()->allocate_memory(device_buffer_alloc_params);
ndarray_allocs_.push_back(alloc);
return alloc;
return get_compute_device()->allocate_memory_runtime(
device_buffer_alloc_params, tlctx->runtime_jit_module, get_llvm_runtime(),
result_buffer);
}

uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr(DeviceAllocation &alloc) {
4 changes: 2 additions & 2 deletions taichi/llvm/llvm_program.h
@@ -97,7 +97,8 @@ class LlvmProgramImpl : public ProgramImpl {

void finalize();

DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size);
DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
uint64 *result_buffer);

uint64_t *get_ndarray_alloc_info_ptr(DeviceAllocation &alloc);

@@ -158,7 +159,6 @@ class LlvmProgramImpl : public ProgramImpl {
DeviceAllocation preallocated_device_buffer_alloc{kDeviceNullAllocation};

std::unordered_map<int, DeviceAllocation> snode_tree_allocs_;
std::vector<DeviceAllocation> ndarray_allocs_;

std::unique_ptr<Device> device_;
cuda::CudaDevice *cuda_device();
5 changes: 3 additions & 2 deletions taichi/program/ndarray.cpp
@@ -17,8 +17,9 @@ Ndarray::Ndarray(Program *prog,
std::multiplies<>())),
element_size_(data_type_size(dtype)) {
LlvmProgramImpl *prog_impl = prog->get_llvm_program_impl();
ndarray_alloc_ =
prog_impl->allocate_memory_ndarray(nelement_ * element_size_);
ndarray_alloc_ = prog_impl->allocate_memory_ndarray(nelement_ * element_size_,
prog->result_buffer);

data_ptr_ = prog_impl->get_ndarray_alloc_info_ptr(ndarray_alloc_);
}

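Taken together with the llvm_program.cpp change above, the Ndarray's backing store is now owned by the runtime pool, which is why the manual dealloc loop over ndarray_allocs_ could be dropped. A hedged sketch composing the pieces this commit touches, using only APIs visible in the diffs above (`prog` is assumed non-null and materialized):

// Illustrative composition of the changed pieces, not code from the commit.
DeviceAllocation make_ndarray_storage(Program *prog,
                                      std::size_t nelement,
                                      std::size_t element_size) {
  LlvmProgramImpl *impl = prog->get_llvm_program_impl();
  // Serviced from the preallocated pool via the JIT'ed runtime allocator;
  // the returned handle indexes the device's allocations_ table.
  return impl->allocate_memory_ndarray(nelement * element_size,
                                       prog->result_buffer);
}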
