diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 95b129ef2a7e8..31ee42da2791f 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -15,6 +15,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#ifdef TI_WITH_AMDGPU
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#endif  // TI_WITH_AMDGPU
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -47,6 +50,9 @@
 #include "llvm_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/codegen_utils.h"
+#ifdef TI_WITH_AMDGPU
+#include "taichi/runtime/llvm/llvm_context_pass.h"
+#endif
 
 #ifdef _WIN32
 // Travis CI seems doesn't support ...
@@ -334,22 +340,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   auto ctx = get_this_thread_context();
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
-  if (arch_ == Arch::cuda) {
-    module->setTargetTriple("nvptx64-nvidia-cuda");
-
-#if defined(TI_WITH_CUDA)
-    auto func = module->getFunction("cuda_compute_capability");
-    if (func) {
-      func->deleteBody();
-      auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
-      IRBuilder<> builder(*ctx);
-      builder.SetInsertPoint(bb);
-      builder.CreateRet(
-          get_constant(CUDAContext::get_instance().get_compute_capability()));
-      TaichiLLVMContext::mark_inline(func);
-    }
-#endif
-
+  if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
     auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
@@ -399,93 +390,124 @@
       TaichiLLVMContext::mark_inline(func);
     };
 
-    patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
-    patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
-    patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
-    patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
-    patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
-    patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
-    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
-    patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
-    patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
-
-    patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
-    patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
-
-    patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
-    patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
-
-    patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
-    patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
-
-    patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
-    patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
-
-    patch_intrinsic("cuda_shfl_down_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_down_i32);
-    patch_intrinsic("cuda_shfl_down_sync_f32",
-                    Intrinsic::nvvm_shfl_sync_down_f32);
-
-    patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
-    patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);
-
-    patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
-
-    patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);
-
-    patch_intrinsic("cuda_shfl_xor_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_bfly_i32);
-
-    patch_intrinsic("cuda_match_any_sync_i32",
-                    Intrinsic::nvvm_match_any_sync_i32);
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    nvvm_match_all_sync_i32
-    Args:
-        1. u32 mask
-        2. i32 value
-        3. i32 *pred
-    */
-    /*
-    patch_intrinsic("cuda_match_all_sync_i32p",
-                    Intrinsic::nvvm_math_all_sync_i32);
-    */
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    patch_intrinsic("cuda_match_any_sync_i64",
-                    Intrinsic::nvvm_match_any_sync_i64);
-    */
-
-    patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-    patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-    patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);
-    patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
-
-    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-    patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
+    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
 
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+    if (arch_ == Arch::cuda) {
+      module->setTargetTriple("nvptx64-nvidia-cuda");
 
-    link_module_with_cuda_libdevice(module);
+#if defined(TI_WITH_CUDA)
+      auto func = module->getFunction("cuda_compute_capability");
+      if (func) {
+        func->deleteBody();
+        auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
+        IRBuilder<> builder(*ctx);
+        builder.SetInsertPoint(bb);
+        builder.CreateRet(
+            get_constant(CUDAContext::get_instance().get_compute_capability()));
+        TaichiLLVMContext::mark_inline(func);
+      }
+#endif
 
-    // To prevent potential symbol name conflicts, we use "cuda_vprintf"
-    // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
-    // linking
-    for (auto &f : *module) {
-      if (f.getName() == "cuda_vprintf") {
-        f.setName("vprintf");
+      patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
+      patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
+      patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
+      patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+      patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
+      patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+      patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+      patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
+      patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
+
+      patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
+      patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
+
+      patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
+      patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
+
+      patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
+      patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
+
+      patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
+      patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
+
+      patch_intrinsic("cuda_shfl_down_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_down_i32);
+      patch_intrinsic("cuda_shfl_down_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_down_f32);
+
+      patch_intrinsic("cuda_shfl_up_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_up_i32);
+      patch_intrinsic("cuda_shfl_up_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_up_f32);
+
+      patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
+
+      patch_intrinsic("cuda_shfl_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_idx_f32);
+
+      patch_intrinsic("cuda_shfl_xor_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_bfly_i32);
+
+      patch_intrinsic("cuda_match_any_sync_i32",
+                      Intrinsic::nvvm_match_any_sync_i32);
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      nvvm_match_all_sync_i32
+      Args:
+          1. u32 mask
+          2. i32 value
+          3. i32 *pred
+      */
+      /*
+      patch_intrinsic("cuda_match_all_sync_i32p",
+                      Intrinsic::nvvm_math_all_sync_i32);
+      */
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      patch_intrinsic("cuda_match_any_sync_i64",
+                      Intrinsic::nvvm_match_any_sync_i64);
+      */
+
+      patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+      patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+
+      link_module_with_cuda_libdevice(module);
+
+      // To prevent potential symbol name conflicts, we use "cuda_vprintf"
+      // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
+      // linking
+      for (auto &f : *module) {
+        if (f.getName() == "cuda_vprintf") {
+          f.setName("vprintf");
+        }
       }
+
+      // runtime_module->print(llvm::errs(), nullptr);
     }
-    // runtime_module->print(llvm::errs(), nullptr);
+    if (arch_ == Arch::amdgpu) {
+      module->setTargetTriple("amdgcn-amd-amdhsa");
+#ifdef TI_WITH_AMDGPU
+      llvm::legacy::FunctionPassManager function_pass_manager(module.get());
+      function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass());
+      function_pass_manager.doInitialization();
+      for (auto func = module->begin(); func != module->end(); ++func) {
+        function_pass_manager.run(*func);
+      }
+      function_pass_manager.doFinalization();
+      patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
+      patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
+#endif
+    }
   }
 
   return module;
@@ -817,6 +839,14 @@ void TaichiLLVMContext::update_runtime_jit_module(
     }
   }
 
+  if (arch_ == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
+    llvm::legacy::PassManager module_pass_manager;
+    module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
+    module_pass_manager.run(*module);
+#endif
+  }
+
   eliminate_unused_functions(module.get(), [](std::string func_name) {
     return starts_with(func_name, "runtime_") ||
            starts_with(func_name, "LLVMRuntime_");
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
new file mode 100644
index 0000000000000..d48303dbc6670
--- /dev/null
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_context.h"
+#endif
+
+namespace taichi {
+namespace lang {
+using namespace llvm;
+#if defined(TI_WITH_AMDGPU)
+struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
+  static char ID;
+  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
+  }
+  bool runOnFunction(llvm::Function &f) override {
+    f.addFnAttr("target-cpu",
+                "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+    f.addFnAttr("target-features", "");
+    for (auto &bb : f) {
+      std::vector<AllocaInst *> alloca_inst_vec;
+      for (Instruction &inst : bb) {
+        AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst);
+        if (!now_alloca ||
+            now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+          continue;
+        }
+        alloca_inst_vec.push_back(now_alloca);
+      }
+      for (auto &allocainst : alloca_inst_vec) {
+        auto alloca_type = allocainst->getAllocatedType();
+        IRBuilder<> builder(allocainst);
+        auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+        auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+        new_alloca->setAlignment(Align(allocainst->getAlign().value()));
+        auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
+        allocainst->replaceAllUsesWith(addrspacecast);
+        allocainst->eraseFromParent();
+      }
+    }
+    return false;
+  }
+};
+
+struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
+  static char ID;
+  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
+  }
+  bool runOnModule(llvm::Module &M) override {
+    for (auto &f : M) {
+      bool is_kernel = false;
+      const std::string func_name = f.getName().str();
+      if (starts_with(func_name, "runtime_")) {
+        f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+        // ref https://llvm.org/docs/AMDGPUUsage.html
+        // "amdgpu-flat-work-group-size"="min,max"
+        // Specify the minimum and maximum flat work group sizes that will be
+        // specified when the kernel is dispatched. Generated by the
+        // amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The
+        // implied default value is 1,1024.
+        f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+        is_kernel = true;
+      }
+      if (!is_kernel && !f.isDeclaration())
+        f.setLinkage(llvm::Function::PrivateLinkage);
+    }
+    std::vector<llvm::Function *> kernel_function;
+    for (auto &f : M) {
+      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
+        kernel_function.push_back(&f);
+    }
+    for (auto &f : kernel_function) {
+      llvm::FunctionType *func_type = f->getFunctionType();
+      std::vector<llvm::Type *> new_func_params;
+      for (auto &arg : f->args()) {
+        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto new_type = llvm::PointerType::get(
+              arg.getType()->getPointerElementType(), unsigned(1));
+          new_func_params.push_back(new_type);
+        } else {
+          new_func_params.push_back(arg.getType());
+        }
+      }
+      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
+                                                   new_func_params, false);
+      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
+                                             f->getAddressSpace());
+      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+      new_func->addFnAttr(
+          "target-cpu",
+          "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+      new_func->setComdat(f->getComdat());
+      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
+      new_func->takeName(f);
+      new_func->getBasicBlockList().splice(new_func->begin(),
+                                           f->getBasicBlockList());
+      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
+                                        I2 = new_func->arg_begin();
+           I != E; ++I, ++I2) {
+        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto &front_bb = new_func->getBasicBlockList().front();
+          llvm::Instruction *addrspacecast =
+              new AddrSpaceCastInst(I2, I->getType());
+          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
+                                             addrspacecast);
+          I->replaceAllUsesWith(addrspacecast);
+          I2->takeName(&*I);
+        } else {
+          I->replaceAllUsesWith(&*I2);
+          I2->takeName(&*I);
+        }
+      }
+
+      f->eraseFromParent();
+    }
+    return false;
+  }
+};
+
+char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
+char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
+#endif
+
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 02a7148084b3b..1fbddac1b3a05 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -112,6 +112,15 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
   }
 #endif
 
+#if defined(TI_WITH_AMDGPU)
+  if (config.arch == Arch::amdgpu) {
+    AMDGPUContext::get_instance().set_debug(config.debug);
+    device_ = std::make_shared<amdgpu::AmdgpuDevice>();
+
+    this->maybe_initialize_amdgpu_llvm_context();
+  }
+#endif
+
 #ifdef TI_WITH_DX12
   if (config.arch == Arch::dx12) {
     // FIXME: add dx12 device.
@@ -149,6 +158,14 @@ void LlvmRuntimeExecutor::maybe_initialize_cuda_llvm_context() {
   }
 }
 
+void LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context() {
+  if (config_->arch == Arch::amdgpu && llvm_context_device_ == nullptr) {
+    llvm_context_device_ =
+        std::make_unique<TaichiLLVMContext>(config_, Arch::amdgpu);
+    llvm_context_device_->init_runtime_jit_module();
+  }
+}
+
 void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
                                                   uint64 *result_buffer) {
   auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index 85cc3def851e0..f9f5a6e6cc7cb 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -129,6 +129,8 @@ class LlvmRuntimeExecutor {
    */
   void maybe_initialize_cuda_llvm_context();
 
+  void maybe_initialize_amdgpu_llvm_context();
+
   void finalize();
 
   uint64 fetch_result_uint64(int i, uint64 *result_buffer);
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index 5e16dc3ea57f0..a8a71b06c7853 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -159,6 +159,10 @@ class LlvmProgramImpl : public ProgramImpl {
     runtime_exec_->maybe_initialize_cuda_llvm_context();
   }
 
+  void maybe_initialize_amdgpu_llvm_context() {
+    runtime_exec_->maybe_initialize_amdgpu_llvm_context();
+  }
+
   uint64 fetch_result_uint64(int i, uint64 *result_buffer) override {
     return runtime_exec_->fetch_result_uint64(i, result_buffer);
   }
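
Context for the two passes in llvm_context_pass.h: generic LLVM IR places allocas and pointer arguments in address space 0, while the AMDGPU backend expects stack slots in addrspace(5) (private/scratch memory) and kernel pointer arguments in addrspace(1) (global memory). Below is a minimal, self-contained sketch of the alloca rewrite in isolation; the pass name AllocaToPrivateAS and the toy kernel_body function are illustrative only (not Taichi code), and it assumes an LLVM build (e.g. 14/15) that still ships the legacy pass manager used by this patch.

// alloca_as_demo.cpp -- build a toy module, run the rewrite, print the IR.
#include <vector>

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

namespace {
// Rewrite every addrspace(0) alloca to addrspace(5), then cast the result
// back to a generic pointer so existing users still type-check. This mirrors
// AMDGPUConvertAllocaInstAddressSpacePass minus the Taichi-specific
// target-cpu attribute handling.
struct AllocaToPrivateAS : public FunctionPass {
  static char ID;
  AllocaToPrivateAS() : FunctionPass(ID) {}

  bool runOnFunction(Function &f) override {
    std::vector<AllocaInst *> worklist;
    for (auto &bb : f)
      for (auto &inst : bb)
        if (auto *ai = dyn_cast<AllocaInst>(&inst))
          if (ai->getType()->getAddressSpace() == 0)
            worklist.push_back(ai);
    for (AllocaInst *ai : worklist) {
      IRBuilder<> builder(ai);
      Type *ty = ai->getAllocatedType();
      AllocaInst *priv = builder.CreateAlloca(ty, /*AddrSpace=*/5);
      priv->setAlignment(ai->getAlign());
      Value *generic =
          builder.CreateAddrSpaceCast(priv, PointerType::get(ty, 0));
      ai->replaceAllUsesWith(generic);
      ai->eraseFromParent();
    }
    return !worklist.empty();
  }
};
char AllocaToPrivateAS::ID = 0;
}  // namespace

int main() {
  LLVMContext ctx;
  Module m("demo", ctx);
  IRBuilder<> b(ctx);

  // Toy function with one generic (addrspace 0) stack slot.
  Function *fn = Function::Create(FunctionType::get(b.getInt32Ty(), false),
                                  Function::ExternalLinkage, "kernel_body", m);
  b.SetInsertPoint(BasicBlock::Create(ctx, "entry", fn));
  AllocaInst *slot = b.CreateAlloca(b.getInt32Ty());
  b.CreateStore(b.getInt32(42), slot);
  b.CreateRet(b.CreateLoad(b.getInt32Ty(), slot));

  // Drive the pass the same way module_from_file() does above.
  legacy::FunctionPassManager fpm(&m);
  fpm.add(new AllocaToPrivateAS());
  fpm.doInitialization();
  for (Function &f : m)
    fpm.run(f);
  fpm.doFinalization();

  // The stack slot now reads: %1 = alloca i32, align 4, addrspace(5)
  m.print(outs(), nullptr);
  return 0;
}

The function-parameter pass cannot take the same in-place approach because LLVM argument types are immutable, which is why AMDGPUConvertFuncParamAddressSpacePass instead clones each AMDGPU_KERNEL function with addrspace(1) pointer parameters, inserts an addrspacecast back to the generic address space at the top of the entry block, and erases the original function.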