[amdgpu] Part3 update runtime module (taichi-dev#6486)
Issue: taichi-dev#6434

### Brief Summary
1. This part updates the Taichi runtime module for the `AMDGPU` backend.
Taichi's runtime module uses `clang++` to generate `LLVM IR`, and the
`AMDGPU` version differs from the CPU-generated `LLVM IR` in how memory
allocation is handled. The following is an example.
```
C/C++ code
void func(int *a, int *b) {
    *a = *b;
}
x86_64 backend LLVM IR
define dso_local void @cpu_func(i32* %0, i32* %1) #2 {
  %3 = alloca i32*, align 8
  %4 = alloca i32*, align 8
  store i32* %0, i32** %3, align 8
  store i32* %1, i32** %4, align 8
  %5 = load i32*, i32** %4, align 8
  %6 = load i32, i32* %5, align 4
  %7 = load i32*, i32** %3, align 8
  store i32 %6, i32* %7, align 4
  ret void
}
__global__ function on AMDGPU
define protected amdgpu_kernel void @global_func(i32 addrspace(1)* %0, i32 addrspace(1)* %1) #4 {
  %3 = alloca i32*, align 8, addrspace(5)
  %4 = alloca i32*, align 8, addrspace(5)
  %5 = alloca i32*, align 8, addrspace(5)
  %6 = alloca i32*, align 8, addrspace(5)
  %7 = addrspacecast i32* addrspace(5)* %3 to i32**
  %8 = addrspacecast i32* addrspace(5)* %4 to i32**
  %9 = addrspacecast i32* addrspace(5)* %5 to i32**
  %10 = addrspacecast i32* addrspace(5)* %6 to i32**
  %11 = addrspacecast i32 addrspace(1)* %0 to i32*
  store i32* %11, i32** %7, align 8
  %12 = load i32*, i32** %7, align 8
  %13 = addrspacecast i32 addrspace(1)* %1 to i32*
  store i32* %13, i32** %8, align 8
  %14 = load i32*, i32** %8, align 8
  store i32* %12, i32** %9, align 8
  store i32* %14, i32** %10, align 8
  %15 = load i32*, i32** %10, align 8
  %16 = load i32, i32* %15, align 4
  %17 = load i32*, i32** %9, align 8
  store i32 %16, i32* %17, align 4
  ret void
}
__device__ function on AMDGPU
define hidden void @device_func(i32* %0, i32* %1) #2 {
  %3 = alloca i32*, align 8, addrspace(5)
  %4 = alloca i32*, align 8, addrspace(5)
  %5 = addrspacecast i32* addrspace(5)* %3 to i32**
  %6 = addrspacecast i32* addrspace(5)* %4 to i32**
  store i32* %0, i32** %5, align 8
  store i32* %1, i32** %6, align 8
  %7 = load i32*, i32** %6, align 8
  %8 = load i32, i32* %7, align 4
  %9 = load i32*, i32** %5, align 8
  store i32 %8, i32* %9, align 4
  ret void
}
```
2. There are differences around `allocainst`, specifically in the address
space used (for `AMDGPU`,
[this](https://llvm.org/docs/AMDGPUUsage.html#address-spaces) will be
helpful). I have not found documentation describing how to write correct
`LLVM IR` for `AMDGPU`, so the rules here come from observing the `LLVM IR`
generated by `clang++/hipcc`. For a `__global__` function we need to handle
both the function arguments and each `allocainst` (specifying the addrspace
of the `allocainst` and inserting an addrspace cast), while for a
`__device__` function only the `allocainst` needs handling; the arguments
are left untouched.

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2 people authored and feisuzhu committed Jan 5, 2023
1 parent bdbe7d0 commit f92eb64
Showing 5 changed files with 282 additions and 94 deletions.
218 changes: 124 additions & 94 deletions taichi/runtime/llvm/llvm_context.cpp
@@ -15,6 +15,9 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#ifdef TI_WITH_AMDGPU
#include "llvm/IR/IntrinsicsAMDGPU.h"
#endif // TI_WITH_AMDGPU
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@@ -43,6 +46,9 @@
#include "llvm_context.h"
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#include "taichi/codegen/codegen_utils.h"
#ifdef TI_WITH_AMDGPU
#include "taichi/runtime/llvm/llvm_context_pass.h"
#endif

#ifdef _WIN32
// Travis CI doesn't seem to support <filesystem>...
@@ -331,22 +337,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
auto ctx = get_this_thread_context();
std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
bool ret = true,
std::vector<llvm::Type *> types = {},
@@ -391,93 +382,124 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
TaichiLLVMContext::mark_inline(func);
};

patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

link_module_with_cuda_libdevice(module);
#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32",
Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32",
Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);

link_module_with_cuda_libdevice(module);

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
}
}

// runtime_module->print(llvm::errs(), nullptr);
}

// runtime_module->print(llvm::errs(), nullptr);
if (arch_ == Arch::amdgpu) {
module->setTargetTriple("amdgcn-amd-amdhsa");
#ifdef TI_WITH_AMDGPU
llvm::legacy::FunctionPassManager function_pass_manager(module.get());
function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass());
function_pass_manager.doInitialization();
for (auto func = module->begin(); func != module->end(); ++func) {
function_pass_manager.run(*func);
}
function_pass_manager.doFinalization();
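// amdgcn_workitem_id_x / amdgcn_workgroup_id_x are the AMDGPU analogues
// of the NVVM tid.x / ctaid.x sregs patched for CUDA above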
patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
#endif
}
}

return module;
@@ -796,6 +818,14 @@ void TaichiLLVMContext::update_runtime_jit_module(
}
}

if (arch_ == Arch::amdgpu) {
#ifdef TI_WITH_AMDGPU
llvm::legacy::PassManager module_pass_manager;
module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
module_pass_manager.run(*module);
#endif
}

eliminate_unused_functions(module.get(), [](std::string func_name) {
return starts_with(func_name, "runtime_") ||
starts_with(func_name, "LLVMRuntime_");
135 changes: 135 additions & 0 deletions taichi/runtime/llvm/llvm_context_pass.h
@@ -0,0 +1,135 @@
#pragma once

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"

#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#endif

namespace taichi {
namespace lang {
using namespace llvm;
#if defined(TI_WITH_AMDGPU)
struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
static char ID;
AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
}
bool runOnFunction(llvm::Function &f) override {
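// Presumably get_mcpu() returns a target name like "gfx1030"; substr(3, 4)
// keeps up to four characters after the "gfx" prefix, which is re-attached.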
f.addFnAttr("target-cpu",
"gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
f.addFnAttr("target-features", "");
for (auto &bb : f) {
std::vector<AllocaInst *> alloca_inst_vec;
for (Instruction &inst : bb) {
AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst);
if (!now_alloca ||
now_alloca->getType()->getAddressSpace() != (unsigned)0) {
continue;
}
alloca_inst_vec.push_back(now_alloca);
}
for (auto &allocainst : alloca_inst_vec) {
auto alloca_type = allocainst->getAllocatedType();
IRBuilder<> builder(allocainst);
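// Re-create the alloca in addrspace(5) (AMDGPU private memory), then cast
// the result back to a generic pointer so existing users keep their types.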
auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
new_alloca->setAlignment(Align(allocainst->getAlign().value()));
auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
allocainst->replaceAllUsesWith(addrspacecast);
allocainst->eraseFromParent();
}
}
return false;
}
};

struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
static char ID;
AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
}
bool runOnModule(llvm::Module &M) override {
for (auto &f : M) {
bool is_kernel = false;
const std::string func_name = f.getName().str();
if (starts_with(func_name, "runtime_")) {
f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
// ref https://llvm.org/docs/AMDGPUUsage.html
// "amdgpu-flat-work-group-size"="min,max"
// Specify the minimum and maximum flat work group sizes that will be
// specified when the kernel is dispatched. Generated by the
// amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied
// default value is 1,1024.
f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
is_kernel = true;
}
if (!is_kernel && !f.isDeclaration())
f.setLinkage(llvm::Function::PrivateLinkage);
}
std::vector<llvm::Function *> kernel_function;
for (auto &f : M) {
if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
kernel_function.push_back(&f);
}
for (auto &f : kernel_function) {
llvm::FunctionType *func_type = f->getFunctionType();
std::vector<llvm::Type *> new_func_params;
for (auto &arg : f->args()) {
if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
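// Pointer parameters of kernels are rewritten into addrspace(1) (global memory).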
auto new_type = llvm::PointerType::get(
arg.getType()->getPointerElementType(), unsigned(1));
new_func_params.push_back(new_type);
} else {
new_func_params.push_back(arg.getType());
}
}
auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
new_func_params, false);
auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
f->getAddressSpace());
new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
new_func->addFnAttr(
"target-cpu",
"gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
new_func->setComdat(f->getComdat());
f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
new_func->takeName(f);
new_func->getBasicBlockList().splice(new_func->begin(),
f->getBasicBlockList());
for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
I2 = new_func->arg_begin();
I != E; ++I, ++I2) {
if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
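// Cast the addrspace(1) argument back to the original generic pointer type
// at the top of the entry block, then redirect all uses to it.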
auto &front_bb = new_func->getBasicBlockList().front();
llvm::Instruction *addrspacecast =
new AddrSpaceCastInst(I2, I->getType());
front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
addrspacecast);
I->replaceAllUsesWith(addrspacecast);
I2->takeName(&*I);
} else {
I->replaceAllUsesWith(&*I2);
I2->takeName(&*I);
}
}

f->eraseFromParent();
}
return false;
}
};

char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
#endif

} // namespace lang
} // namespace taichi