Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[amdgpu] Part3 update runtime module #6486

Merged
merged 12 commits into from
Dec 30, 2022
218 changes: 124 additions & 94 deletions taichi/runtime/llvm/llvm_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#ifdef TI_WITH_AMDGPU
#include "llvm/IR/IntrinsicsAMDGPU.h"
#endif // TI_WITH_AMDGPU
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
Expand Down Expand Up @@ -47,6 +50,9 @@
#include "llvm_context.h"
#include "taichi/runtime/program_impls/llvm/llvm_program.h"
#include "taichi/codegen/codegen_utils.h"
#ifdef TI_WITH_AMDGPU
#include "taichi/runtime/llvm/llvm_context_pass.h"
#endif

#ifdef _WIN32
// Travis CI seems doesn't support <filesystem>...
Expand Down Expand Up @@ -334,22 +340,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
auto ctx = get_this_thread_context();
std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
bool ret = true,
std::vector<llvm::Type *> types = {},
Expand Down Expand Up @@ -399,93 +390,124 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
TaichiLLVMContext::mark_inline(func);
};

patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);

patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
if (arch_ == Arch::cuda) {
module->setTargetTriple("nvptx64-nvidia-cuda");

link_module_with_cuda_libdevice(module);
#if defined(TI_WITH_CUDA)
auto func = module->getFunction("cuda_compute_capability");
if (func) {
func->deleteBody();
auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
IRBuilder<> builder(*ctx);
builder.SetInsertPoint(bb);
builder.CreateRet(
get_constant(CUDAContext::get_instance().get_compute_capability()));
TaichiLLVMContext::mark_inline(func);
}
#endif

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);

patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);

patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);

patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);

patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);

patch_intrinsic("cuda_shfl_down_sync_i32",
Intrinsic::nvvm_shfl_sync_down_i32);
patch_intrinsic("cuda_shfl_down_sync_f32",
Intrinsic::nvvm_shfl_sync_down_f32);

patch_intrinsic("cuda_shfl_up_sync_i32",
Intrinsic::nvvm_shfl_sync_up_i32);
patch_intrinsic("cuda_shfl_up_sync_f32",
Intrinsic::nvvm_shfl_sync_up_f32);

patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);

patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);

patch_intrinsic("cuda_shfl_xor_sync_i32",
Intrinsic::nvvm_shfl_sync_bfly_i32);

patch_intrinsic("cuda_match_any_sync_i32",
Intrinsic::nvvm_match_any_sync_i32);

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
nvvm_match_all_sync_i32
Args:
1. u32 mask
2. i32 value
3. i32 *pred
*/
/*
patch_intrinsic("cuda_match_all_sync_i32p",
Intrinsic::nvvm_math_all_sync_i32);
*/

// LLVM 10.0.0 seems to have a bug on this intrinsic function
/*
patch_intrinsic("cuda_match_any_sync_i64",
Intrinsic::nvvm_match_any_sync_i64);
*/

patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
{llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});

patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);

link_module_with_cuda_libdevice(module);

// To prevent potential symbol name conflicts, we use "cuda_vprintf"
// instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
// linking
for (auto &f : *module) {
if (f.getName() == "cuda_vprintf") {
f.setName("vprintf");
}
}

// runtime_module->print(llvm::errs(), nullptr);
}

// runtime_module->print(llvm::errs(), nullptr);
if (arch_ == Arch::amdgpu) {
module->setTargetTriple("amdgcn-amd-amdhsa");
#ifdef TI_WITH_AMDGPU
llvm::legacy::FunctionPassManager function_pass_manager(module.get());
function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass());
function_pass_manager.doInitialization();
for (auto func = module->begin(); func != module->end(); ++func) {
function_pass_manager.run(*func);
}
function_pass_manager.doFinalization();
patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
#endif
}
}

return module;
Expand Down Expand Up @@ -817,6 +839,14 @@ void TaichiLLVMContext::update_runtime_jit_module(
}
}

if (arch_ == Arch::amdgpu) {
#ifdef TI_WITH_AMDGPU
llvm::legacy::PassManager module_pass_manager;
module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
module_pass_manager.run(*module);
#endif
}

eliminate_unused_functions(module.get(), [](std::string func_name) {
return starts_with(func_name, "runtime_") ||
starts_with(func_name, "LLVMRuntime_");
Expand Down
135 changes: 135 additions & 0 deletions taichi/runtime/llvm/llvm_context_pass.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
#pragma once

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"

#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#endif

namespace taichi {
namespace lang {
using namespace llvm;
#if defined(TI_WITH_AMDGPU)
struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
  static char ID;
  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
  }
  // Rewrites every generic (addrspace 0) `alloca` into an addrspace(5)
  // alloca followed by an addrspacecast back to a generic pointer, so all
  // existing users keep their original pointer type. Also stamps the
  // function with the target-cpu/target-features attributes the AMDGPU
  // backend expects.
  bool runOnFunction(llvm::Function &f) override {
    // NOTE(review): assumes get_mcpu() returns a string shaped like
    // "gfxXXXX" so that substr(3, 4) yields the 4-char chip id -- confirm
    // against AMDGPUContext::get_mcpu().
    f.addFnAttr("target-cpu",
                "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
    f.addFnAttr("target-features", "");
    for (auto &bb : f) {
      // Collect first, rewrite second: erasing instructions while iterating
      // over the basic block would invalidate the iterator.
      std::vector<AllocaInst *> alloca_inst_vec;
      for (Instruction &inst : bb) {
        AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst);
        if (!now_alloca ||
            now_alloca->getType()->getAddressSpace() != (unsigned)0) {
          continue;
        }
        alloca_inst_vec.push_back(now_alloca);
      }
      for (auto &allocainst : alloca_inst_vec) {
        auto alloca_type = allocainst->getAllocatedType();
        IRBuilder<> builder(allocainst);
        // Address space 5 is AMDGPU private (scratch) memory; see
        // https://llvm.org/docs/AMDGPUUsage.html#address-spaces
        auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
        auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
        new_alloca->setAlignment(Align(allocainst->getAlign().value()));
        auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
        allocainst->replaceAllUsesWith(addrspacecast);
        allocainst->eraseFromParent();
      }
    }
    // The legacy pass manager requires runOnFunction to return true when the
    // IR was modified; the attribute updates above always modify `f`, so
    // returning false here (as before) violated that contract.
    return true;
  }
};

struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
  static char ID;
  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
  }
  // Promotes Taichi runtime entry points ("runtime_*") to AMDGPU kernels:
  // gives them the AMDGPU_KERNEL calling convention, then rebuilds each
  // kernel so its pointer parameters live in addrspace(1) (global memory).
  // An addrspacecast back to the generic address space is inserted at the
  // top of the entry block so the original uses of each parameter keep
  // their type. Every other function definition is switched to private
  // linkage so unused ones can be eliminated later.
  bool runOnModule(llvm::Module &M) override {
    bool modified = false;
    for (auto &f : M) {
      bool is_kernel = false;
      const std::string func_name = f.getName().str();
      if (starts_with(func_name, "runtime_")) {
        f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
        // ref https://llvm.org/docs/AMDGPUUsage.html
        // “amdgpu-flat-work-group-size”=”min,max”
        // Specify the minimum and maximum flat work group sizes that will be
        // specified when the kernel is dispatched. Generated by the
        // amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied
        // default value is 1,1024.
        // NOTE(review): the documented format is "min,max" with no space;
        // LLVM trims each field when parsing, so "1, 1024" works, but
        // "1,1024" would match the docs exactly -- confirm before changing.
        f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
        is_kernel = true;
        modified = true;
      }
      if (!is_kernel && !f.isDeclaration()) {
        f.setLinkage(llvm::Function::PrivateLinkage);
        modified = true;
      }
    }
    // Snapshot the kernel list before rewriting: the loop below inserts and
    // erases functions, which would invalidate a plain module iterator.
    std::vector<llvm::Function *> kernel_function;
    for (auto &f : M) {
      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
        kernel_function.push_back(&f);
    }
    for (auto &f : kernel_function) {
      // Build the replacement signature: pointer params move to addrspace(1).
      llvm::FunctionType *func_type = f->getFunctionType();
      std::vector<llvm::Type *> new_func_params;
      for (auto &arg : f->args()) {
        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
          auto new_type = llvm::PointerType::get(
              arg.getType()->getPointerElementType(), unsigned(1));
          new_func_params.push_back(new_type);
        } else {
          new_func_params.push_back(arg.getType());
        }
      }
      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
                                                   new_func_params, false);
      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
                                             f->getAddressSpace());
      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
      new_func->addFnAttr(
          "target-cpu",
          "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
      new_func->setComdat(f->getComdat());
      // Place the replacement next to the original, then steal its name and
      // its basic blocks wholesale (no cloning).
      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
      new_func->takeName(f);
      new_func->getBasicBlockList().splice(new_func->begin(),
                                           f->getBasicBlockList());
      // Rewire old arguments onto the new ones. Pointer args are routed
      // through an addrspacecast inserted at the top of the entry block so
      // their existing users still see a generic pointer.
      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
                                        I2 = new_func->arg_begin();
           I != E; ++I, ++I2) {
        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
          auto &front_bb = new_func->getBasicBlockList().front();
          llvm::Instruction *addrspacecast =
              new AddrSpaceCastInst(I2, I->getType());
          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
                                             addrspacecast);
          I->replaceAllUsesWith(addrspacecast);
          I2->takeName(&*I);
        } else {
          I->replaceAllUsesWith(&*I2);
          I2->takeName(&*I);
        }
      }

      f->eraseFromParent();
    }
    // Legacy pass manager contract: runOnModule must report whether the
    // module was changed; the previous unconditional `return false` was
    // wrong whenever any function was rewritten.
    return modified;
  }
};

// Pass identification tokens used by the legacy pass manager (their address,
// not their value, is what matters). Defined `inline` (C++17) so that
// including this header from more than one translation unit does not violate
// the one-definition rule with duplicate-symbol link errors.
inline char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
inline char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
#endif

} // namespace lang
} // namespace taichi
Loading