[amdgpu] Update amdgpu module call (#7022)
Issue: #6434

### Brief Summary
1. Add a dedicated path for processing kernel arguments on AMDGPU.
2. Update the `launch` API in `AMDGPUContext`.
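
For context, a minimal sketch of the updated call shape (illustrative only; `context` and the `task` fields mirror the CUDA codegen diff below):

```cpp
// Each kernel argument pointer is now paired with its byte size, so the
// AMDGPU backend can pack an aligned parameter buffer before launching.
// The CUDA path still passes a single RuntimeContext pointer, so its
// size list stays empty.
std::vector<void *> arg_pointers = {&context};
std::vector<int> arg_sizes = {};  // unused on the CUDA path
cuda_module->launch(task.name, task.grid_dim, task.block_dim,
                    /*shared_mem_bytes=*/0, arg_pointers, arg_sizes);
```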

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
galeselee and pre-commit-ci[bot] committed Jan 6, 2023
1 parent 328cacf commit 405b700
Showing 8 changed files with 101 additions and 29 deletions.
2 changes: 1 addition & 1 deletion taichi/codegen/cuda/codegen_cuda.cpp
@@ -699,7 +699,7 @@ FunctionType CUDAModuleToFunctionConverter::convert(
      TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
               task.block_dim);
      cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0,
-                         {&context});
+                         {&context}, {});
    }

    // copy data back to host
2 changes: 1 addition & 1 deletion taichi/inc/archs.inc.h
@@ -14,6 +14,6 @@ PER_ARCH(opengl)  // OpenGL Compute Shaders
PER_ARCH(dx11)    // Microsoft DirectX 11, WIP
PER_ARCH(dx12)    // Microsoft DirectX 12, WIP
PER_ARCH(opencl)  // OpenCL, N/A
-PER_ARCH(amdgpu)  // AMD GPU, N/A
+PER_ARCH(amdgpu)  // AMD GPU, WIP
PER_ARCH(vulkan)  // Vulkan
PER_ARCH(gles)    // OpenGL ES
34 changes: 20 additions & 14 deletions taichi/jit/jit_module.h
@@ -2,6 +2,7 @@

#include <memory>
#include <functional>
+#include <tuple>

#include "taichi/inc/constants.h"
#include "taichi/util/lang_util.h"
@@ -33,31 +34,36 @@ class JITModule {
    return ret;
  }

-  static std::vector<void *> get_arg_pointers() {
-    return std::vector<void *>();
+  inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers() {
+    return std::make_tuple(std::vector<void *>(), std::vector<int>());
  }

  template <typename... Args, typename T>
-  static std::vector<void *> get_arg_pointers(T &t, Args &...args) {
-    auto ret = get_arg_pointers(args...);
-    ret.insert(ret.begin(), &t);
-    return ret;
+  inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers(
+      T &t,
+      Args &...args) {
+    auto [arg_pointers, arg_sizes] = get_arg_pointers(args...);
+    arg_pointers.insert(arg_pointers.begin(), &t);
+    arg_sizes.insert(arg_sizes.begin(), sizeof(t));
+    return std::make_tuple(arg_pointers, arg_sizes);
  }

  // Note: **call** is for serial functions
  // Note: args must be passed by value
+  // Note: AMDGPU currently needs to pass args via extra_arg
  template <typename... Args>
  void call(const std::string &name, Args... args) {
    if (direct_dispatch()) {
      get_function<Args...>(name)(args...);
    } else {
-      auto arg_pointers = JITModule::get_arg_pointers(args...);
-      call(name, arg_pointers);
+      auto [arg_pointers, arg_sizes] = JITModule::get_arg_pointers(args...);
+      call(name, arg_pointers, arg_sizes);
    }
  }

  virtual void call(const std::string &name,
-                    const std::vector<void *> &arg_pointers) {
+                    const std::vector<void *> &arg_pointers,
+                    const std::vector<int> &arg_sizes) {
    TI_NOT_IMPLEMENTED
  }

@@ -69,20 +75,20 @@ class JITModule {
              std::size_t block_dim,
              std::size_t shared_mem_bytes,
              Args... args) {
-    auto arg_pointers = JITModule::get_arg_pointers(args...);
-    launch(name, grid_dim, block_dim, shared_mem_bytes, arg_pointers);
+    auto [arg_pointers, arg_sizes] = JITModule::get_arg_pointers(args...);
+    launch(name, grid_dim, block_dim, shared_mem_bytes, arg_pointers,
+           arg_sizes);
  }

  virtual void launch(const std::string &name,
                      std::size_t grid_dim,
                      std::size_t block_dim,
                      std::size_t shared_mem_bytes,
-                     const std::vector<void *> &arg_pointers) {
+                     const std::vector<void *> &arg_pointers,
+                     const std::vector<int> &arg_sizes) {
    TI_NOT_IMPLEMENTED
  }

  // directly call the function (e.g. on CPU), or via another runtime system
  // (e.g. cudaLaunch)?
  virtual bool direct_dispatch() const = 0;

  virtual ~JITModule() {
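
The variadic `get_arg_pointers` above now collects a byte size alongside each pointer. A self-contained sketch of the same recursion (free functions here instead of class members, purely for illustration):

```cpp
#include <cstdio>
#include <tuple>
#include <vector>

// Base case: no arguments left, return empty pointer and size lists.
inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers() {
  return std::make_tuple(std::vector<void *>(), std::vector<int>());
}

// Recursive case: peel off the first argument and record its address and size.
template <typename... Args, typename T>
inline std::tuple<std::vector<void *>, std::vector<int> > get_arg_pointers(
    T &t,
    Args &...args) {
  auto [arg_pointers, arg_sizes] = get_arg_pointers(args...);
  arg_pointers.insert(arg_pointers.begin(), &t);
  arg_sizes.insert(arg_sizes.begin(), sizeof(t));
  return std::make_tuple(arg_pointers, arg_sizes);
}

int main() {
  int i = 42;
  double d = 3.14;
  auto [ptrs, sizes] = get_arg_pointers(i, d);
  // ptrs  == {&i, &d}
  // sizes == {sizeof(int), sizeof(double)}, i.e. {4, 8} on common platforms
  std::printf("%zu args, sizes %d and %d\n", ptrs.size(), sizes[0], sizes[1]);
  return 0;
}
```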
65 changes: 60 additions & 5 deletions taichi/rhi/amdgpu/amdgpu_context.cpp
@@ -61,17 +61,72 @@ std::string AMDGPUContext::get_device_name() {
  return str;
}

+int AMDGPUContext::get_args_byte(std::vector<int> arg_sizes) {
+  int byte_cnt = 0;
+  int naive_add = 0;
+  for (auto &size : arg_sizes) {
+    naive_add += size;
+    if (size < 32) {
+      if ((byte_cnt + size) % 32 > (byte_cnt) % 32 ||
+          (byte_cnt + size) % 32 == 0)
+        byte_cnt += size;
+      else
+        byte_cnt += 32 - byte_cnt % 32 + size;
+    } else {
+      if (byte_cnt % 32 != 0)
+        byte_cnt += 32 - byte_cnt % 32 + size;
+      else
+        byte_cnt += size;
+    }
+  }
+  return byte_cnt;
+}
+
+void AMDGPUContext::pack_args(std::vector<void *> arg_pointers,
+                              std::vector<int> arg_sizes,
+                              char *arg_packed) {
+  int byte_cnt = 0;
+  for (int ii = 0; ii < arg_pointers.size(); ii++) {
+    // Each parameter is packed into a 32-byte (vec4-style) window
+    if (arg_sizes[ii] < 32) {
+      if ((byte_cnt + arg_sizes[ii]) % 32 > (byte_cnt % 32) ||
+          (byte_cnt + arg_sizes[ii]) % 32 == 0) {
+        std::memcpy(arg_packed + byte_cnt, arg_pointers[ii], arg_sizes[ii]);
+        byte_cnt += arg_sizes[ii];
+      } else {
+        int padding_size = 32 - byte_cnt % 32;
+        byte_cnt += padding_size;
+        std::memcpy(arg_packed + byte_cnt, arg_pointers[ii], arg_sizes[ii]);
+        byte_cnt += arg_sizes[ii];
+      }
+    } else {
+      if (byte_cnt % 32 != 0) {
+        int padding_size = 32 - byte_cnt % 32;
+        byte_cnt += padding_size;
+        std::memcpy(arg_packed + byte_cnt, arg_pointers[ii], arg_sizes[ii]);
+        byte_cnt += arg_sizes[ii];
+      } else {
+        std::memcpy(arg_packed + byte_cnt, arg_pointers[ii], arg_sizes[ii]);
+        byte_cnt += arg_sizes[ii];
+      }
+    }
+  }
+}
+
 void AMDGPUContext::launch(void *func,
                            const std::string &task_name,
-                           void *arg_pointers,
+                           const std::vector<void *> &arg_pointers,
+                           const std::vector<int> &arg_sizes,
                            unsigned grid_dim,
                            unsigned block_dim,
-                           std::size_t dynamic_shared_mem_bytes,
-                           int arg_bytes) {
+                           std::size_t dynamic_shared_mem_bytes) {
+  auto pack_size = get_args_byte(arg_sizes);
+  char *packed_arg = (char *)std::malloc(pack_size);
+  pack_args(arg_pointers, arg_sizes, packed_arg);
  if (grid_dim > 0) {
    std::lock_guard<std::mutex> _(lock_);
-    void *config[] = {(void *)0x01, const_cast<void *>(arg_pointers),
-                      (void *)0x02, &arg_bytes, (void *)0x03};
+    void *config[] = {(void *)0x01, (void *)packed_arg, (void *)0x02,
+                      (void *)&pack_size, (void *)0x03};
    driver_.launch_kernel(func, grid_dim, 1, 1, block_dim, 1, 1,
                          dynamic_shared_mem_bytes, nullptr, nullptr,
                          reinterpret_cast<void **>(&config));
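
The packing rule in `get_args_byte`/`pack_args` is easiest to see with concrete sizes: an argument smaller than 32 bytes may share the current 32-byte window only if it does not straddle the next 32-byte boundary; otherwise (and for any larger argument that is misaligned) the write position is first padded up to the boundary. A standalone sketch of the same offset arithmetic (`next_offset` is a name introduced here, not part of the diff):

```cpp
#include <cstdio>
#include <vector>

// Mirrors the branch structure of AMDGPUContext::get_args_byte(): returns the
// byte count after appending one argument of `size` bytes at offset `byte_cnt`.
static int next_offset(int byte_cnt, int size) {
  if (size < 32) {
    // Small argument: keep it in the current 32-byte window if it fits
    // (or ends exactly on a boundary)...
    if ((byte_cnt + size) % 32 > byte_cnt % 32 || (byte_cnt + size) % 32 == 0)
      return byte_cnt + size;
  } else if (byte_cnt % 32 == 0) {
    // Large argument already on a 32-byte boundary.
    return byte_cnt + size;
  }
  // ...otherwise pad up to the next boundary, then place the argument.
  return byte_cnt + (32 - byte_cnt % 32) + size;
}

int main() {
  // A 16-byte argument after a 24-byte one would straddle offset 32,
  // so it is padded to start at 32; the packed buffer ends at 48 bytes.
  std::vector<int> sizes = {24, 16};
  int byte_cnt = 0;
  for (int s : sizes) {
    int end = next_offset(byte_cnt, s);
    std::printf("size %2d -> offsets [%2d, %2d)\n", s, end - s, end);
    byte_cnt = end;
  }
  std::printf("packed buffer: %d bytes\n", byte_cnt);  // prints 48
  return 0;
}
```

The three sentinel pointers in `config` appear to match HIP's launch-parameter markers (`HIP_LAUNCH_PARAM_BUFFER_POINTER` = 0x01, `HIP_LAUNCH_PARAM_BUFFER_SIZE` = 0x02, `HIP_LAUNCH_PARAM_END` = 0x03), which hand the packed buffer and its size to a `hipModuleLaunchKernel`-style launch.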
12 changes: 9 additions & 3 deletions taichi/rhi/amdgpu/amdgpu_context.h
@@ -34,13 +34,19 @@ class AMDGPUContext {
    return dev_count_ != 0;
  }

+  void pack_args(std::vector<void *> arg_pointers,
+                 std::vector<int> arg_sizes,
+                 char *arg_packed);
+
+  int get_args_byte(std::vector<int> arg_sizes);
+
  void launch(void *func,
              const std::string &task_name,
-              void *arg_pointers,
+              const std::vector<void *> &arg_pointers,
+              const std::vector<int> &arg_sizes,
              unsigned grid_dim,
              unsigned block_dim,
-              std::size_t dynamic_shared_mem_bytes,
-              int arg_bytes);
+              std::size_t dynamic_shared_mem_bytes);

  void set_debug(bool debug) {
    debug_ = debug;
1 change: 1 addition & 0 deletions taichi/rhi/cuda/cuda_context.cpp
@@ -75,6 +75,7 @@ std::string CUDAContext::get_device_name() {
void CUDAContext::launch(void *func,
                         const std::string &task_name,
                         std::vector<void *> arg_pointers,
+                        std::vector<int> arg_sizes,
                         unsigned grid_dim,
                         unsigned block_dim,
                         std::size_t dynamic_shared_mem_bytes) {
1 change: 1 addition & 0 deletions taichi/rhi/cuda/cuda_context.h
@@ -42,6 +42,7 @@ class CUDAContext {
  void launch(void *func,
              const std::string &task_name,
              std::vector<void *> arg_pointers,
+              std::vector<int> arg_sizes,
              unsigned grid_dim,
              unsigned block_dim,
              std::size_t dynamic_shared_mem_bytes);
13 changes: 8 additions & 5 deletions taichi/runtime/cuda/jit_cuda.h
@@ -60,18 +60,21 @@ class JITModuleCUDA : public JITModule {
  }

  void call(const std::string &name,
-            const std::vector<void *> &arg_pointers) override {
-    launch(name, 1, 1, 0, arg_pointers);
+            const std::vector<void *> &arg_pointers,
+            const std::vector<int> &arg_sizes) override {
+    launch(name, 1, 1, 0, arg_pointers, arg_sizes);
  }

  void launch(const std::string &name,
              std::size_t grid_dim,
              std::size_t block_dim,
              std::size_t dynamic_shared_mem_bytes,
-              const std::vector<void *> &arg_pointers) override {
+              const std::vector<void *> &arg_pointers,
+              const std::vector<int> &arg_sizes) override {
    auto func = lookup_function(name);
-    CUDAContext::get_instance().launch(func, name, arg_pointers, grid_dim,
-                                       block_dim, dynamic_shared_mem_bytes);
+    CUDAContext::get_instance().launch(func, name, arg_pointers, arg_sizes,
+                                       grid_dim, block_dim,
+                                       dynamic_shared_mem_bytes);
  }

  bool direct_dispatch() const override {
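
End to end, a serial runtime call now flows from `JITModule::call` through the new overloads above. A minimal mock (illustrative only; the real classes live in `taichi/jit/jit_module.h` and `taichi/runtime/cuda/jit_cuda.h`):

```cpp
#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Mock of the call()/launch() overloads added in this commit, showing how the
// (pointers, sizes) pair travels: JITModuleCUDA::call forwards a serial
// function as a <<<1, 1>>> launch with zero shared memory.
struct MockJITModule {
  void call(const std::string &name,
            const std::vector<void *> &arg_pointers,
            const std::vector<int> &arg_sizes) {
    launch(name, 1, 1, 0, arg_pointers, arg_sizes);
  }
  void launch(const std::string &name,
              std::size_t grid_dim,
              std::size_t block_dim,
              std::size_t shared_mem_bytes,
              const std::vector<void *> &arg_pointers,
              const std::vector<int> &arg_sizes) {
    std::printf("launch %s<<<%zu, %zu>>> with %zu args\n", name.c_str(),
                grid_dim, block_dim, arg_pointers.size());
  }
};

int main() {
  MockJITModule module;
  int arg0 = 16;
  float arg1 = 0.5f;
  // What JITModule::call(name, arg0, arg1) assembles via get_arg_pointers():
  module.call("runtime_init", {&arg0, &arg1}, {sizeof(arg0), sizeof(arg1)});
  return 0;
}
```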
