From e5a32a1fc8c376b9554e178dfa82fd6a45cae748 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Mon, 31 Oct 2022 21:13:30 +0800 Subject: [PATCH 01/12] alter global function's arg address --- taichi/runtime/llvm/llvm_context.cpp | 65 ++++++++++++++++++++++++++++ taichi/runtime/llvm/llvm_context.h | 2 + 2 files changed, 67 insertions(+) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 95b129ef2a7e8..f8e3a1c8ee4f3 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -711,6 +711,11 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func, } } +void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) { + func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); +} + void TaichiLLVMContext::eliminate_unused_functions( llvm::Module *module, std::function export_indicator) { @@ -817,6 +822,66 @@ void TaichiLLVMContext::update_runtime_jit_module( } } + if (arch_ == Arch::amdgpu) { + for (auto &f : *module) { + bool is_kernel = false; + const std::string func_name = f.getName().str(); + if (starts_with(func_name, "runtime_")) { + mark_function_as_amdgpu_kernel(&f); + is_kernel = true; + } + if (!is_kernel && !f.isDeclaration()) + f.setLinkage(llvm::Function::PrivateLinkage); + } + std::vector global_func; + for (auto &f : *module) { + if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) + global_func.push_back(&f); + } + for (auto &f : global_func) { + llvm::FunctionType *func_type = f->getFunctionType(); + std::vector new_func_params; + for (auto &arg : f->args()) { + if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { + auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1)); + new_func_params.push_back(new_type); + } + else { + new_func_params.push_back(arg.getType()); + } + } + auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false); + auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace()); + //NF->copyAttributesFrom(f); + new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); + new_func->setComdat(f->getComdat()); + f->getParent()->getFunctionList().insert(f->getIterator(), new_func); + new_func->takeName(f); + new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList()); + for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), + I2 = new_func->arg_begin(); I != E; ++I, ++I2) { + if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { + auto &front_bb = new_func->getBasicBlockList().front(); + llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType()); + front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast); + I->replaceAllUsesWith(addrspacecast); + I2->takeName(&*I); + } + else { + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); + } + } + + SmallVector, 1> MDs; + f->getAllMetadata(MDs); + for (auto [KindID, Node] : MDs) + new_func->addMetadata(KindID, *Node); + f->eraseFromParent(); + } + } + eliminate_unused_functions(module.get(), [](std::string func_name) { return starts_with(func_name, "runtime_") || starts_with(func_name, "LLVMRuntime_"); diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h index ddea66efd763d..58ca21f68c0d6 100644 --- a/taichi/runtime/llvm/llvm_context.h +++ b/taichi/runtime/llvm/llvm_context.h @@ -125,6 +125,8 @@ class TaichiLLVMContext { void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0); + void mark_function_as_amdgpu_kernel(llvm::Function *func); + void fetch_this_thread_struct_module(); llvm::Module *get_this_thread_runtime_module(); llvm::Function *get_runtime_function(const std::string &name); From 8e91537d4de00056a46aeebf6723189300bd2cef Mon Sep 17 00:00:00 2001 From: zeyuli Date: Mon, 31 Oct 2022 21:22:45 +0800 Subject: [PATCH 02/12] add misc api --- taichi/runtime/llvm/llvm_runtime_executor.cpp | 17 +++++++++++++++++ taichi/runtime/llvm/llvm_runtime_executor.h | 2 ++ .../runtime/program_impls/llvm/llvm_program.h | 4 ++++ 3 files changed, 23 insertions(+) diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 02a7148084b3b..6d087696ef71b 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -112,6 +112,15 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, } #endif +#if define(TI_WITH_AMDGPU) + if (config.arch == Arch::amdgpu) { + AMDGPUContext::get_instance().set_debug(config.debug); + device_ = std::make_shared(); + + this->maybe_initialize_amdgpu_llvm_context(); + } +#endif + #ifdef TI_WITH_DX12 if (config.arch == Arch::dx12) { // FIXME: add dx12 device. @@ -149,6 +158,14 @@ void LlvmRuntimeExecutor::maybe_initialize_cuda_llvm_context() { } } +void LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context() { + if (config_->arch == Arch::amdgpu && llvm_context_device_ == nullptr) { + llvm_context_device_ = + std::make_unique(config_, Arch::amdgpu); + llvm_context_device_->init_runtime_jit_module(); + } +} + void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager, uint64 *result_buffer) { auto list_manager_len = runtime_query("ListManager_get_num_elements", diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h index 85cc3def851e0..f9f5a6e6cc7cb 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.h +++ b/taichi/runtime/llvm/llvm_runtime_executor.h @@ -129,6 +129,8 @@ class LlvmRuntimeExecutor { */ void maybe_initialize_cuda_llvm_context(); + void maybe_initialize_amdgpu_llvm_context(); + void finalize(); uint64 fetch_result_uint64(int i, uint64 *result_buffer); diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h index 5e16dc3ea57f0..a8a71b06c7853 100644 --- a/taichi/runtime/program_impls/llvm/llvm_program.h +++ b/taichi/runtime/program_impls/llvm/llvm_program.h @@ -159,6 +159,10 @@ class LlvmProgramImpl : public ProgramImpl { runtime_exec_->maybe_initialize_cuda_llvm_context(); } + void maybe_initialize_amdgpu_llvm_context() { + runtime_exec_->maybe_initialize_amdgpu_llvm_context(); + } + uint64 fetch_result_uint64(int i, uint64 *result_buffer) override { return runtime_exec_->fetch_result_uint64(i, result_buffer); } From 3546946e2b4e81c626c91fcb2ef4b97c4b9dc3c0 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Mon, 31 Oct 2022 21:36:57 +0800 Subject: [PATCH 03/12] add update addrspace module --- taichi/runtime/llvm/llvm_context.cpp | 83 +++++++++++++++++++--------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index f8e3a1c8ee4f3..b6057e96fe343 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -15,6 +15,9 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#ifdef TI_WITH_AMDGPU +#include "llvm/IR/IntrinsicsAMDGPU.h" +#endif //TI_WITH_AMDGPU #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -334,23 +337,8 @@ std::unique_ptr TaichiLLVMContext::module_from_file( auto ctx = get_this_thread_context(); std::unique_ptr module = module_from_bitcode_file( fmt::format("{}/{}", runtime_lib_dir(), file), ctx); - if (arch_ == Arch::cuda) { - module->setTargetTriple("nvptx64-nvidia-cuda"); - -#if defined(TI_WITH_CUDA) - auto func = module->getFunction("cuda_compute_capability"); - if (func) { - func->deleteBody(); - auto bb = llvm::BasicBlock::Create(*ctx, "entry", func); - IRBuilder<> builder(*ctx); - builder.SetInsertPoint(bb); - builder.CreateRet( - get_constant(CUDAContext::get_instance().get_compute_capability())); - TaichiLLVMContext::mark_inline(func); - } -#endif - - auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, + if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) { + auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, bool ret = true, std::vector types = {}, std::vector extra_args = {}) { @@ -399,6 +387,28 @@ std::unique_ptr TaichiLLVMContext::module_from_file( TaichiLLVMContext::mark_inline(func); }; + patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add); + patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); + patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); + patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); + } + + if (arch_ == Arch::cuda) { + module->setTargetTriple("nvptx64-nvidia-cuda"); + +#if defined(TI_WITH_CUDA) + auto func = module->getFunction("cuda_compute_capability"); + if (func) { + func->deleteBody(); + auto bb = llvm::BasicBlock::Create(*ctx, "entry", func); + IRBuilder<> builder(*ctx); + builder.SetInsertPoint(bb); + builder.CreateRet( + get_constant(CUDAContext::get_instance().get_compute_capability())); + TaichiLLVMContext::mark_inline(func); + } +#endif + patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x); patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64); patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x); @@ -464,14 +474,6 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_intrinsic("cttz_i32", Intrinsic::cttz, true, {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)}); - patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add); - - patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); - - patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); - - patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); - patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); link_module_with_cuda_libdevice(module); @@ -488,6 +490,37 @@ std::unique_ptr TaichiLLVMContext::module_from_file( // runtime_module->print(llvm::errs(), nullptr); } + if (arch_ == Arch::amdgpu) { + module->setTargetTriple("amdgcn-amd-amdhsa"); + for (auto &f : *module) { + f.addFnAttr("target-cpu",""); + f.addFnAttr("target-features",""); + for (auto &bb: f) { + std::vector alloca_inst_vec; + for (llvm::Instruction &inst : bb) { + llvm::AllocaInst* now_alloca = llvm::dyn_cast(&inst); + if (!now_alloca || + now_alloca->getType()->getAddressSpace() != (unsigned)0) { + continue; + } + alloca_inst_vec.push_back(now_alloca); + } + for (auto &allocainst : alloca_inst_vec) { + auto alloca_type = allocainst->getAllocatedType(); + llvm::IRBuilder<> builder(allocainst); + auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); + auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); + new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); + auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); + allocainst->replaceAllUsesWith(addrspacecast); + allocainst->eraseFromParent(); + } + } + } + patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x); + patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x); + } + return module; } From a0ec473d6cd5ade8b1a98ddeac8a04a5afbc4953 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Tue, 1 Nov 2022 00:45:36 +0800 Subject: [PATCH 04/12] fix scope --- taichi/runtime/llvm/llvm_context.cpp | 245 ++++++++++++++------------- 1 file changed, 123 insertions(+), 122 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index b6057e96fe343..87f25433b15f4 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -391,134 +391,135 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); - } - - if (arch_ == Arch::cuda) { - module->setTargetTriple("nvptx64-nvidia-cuda"); - -#if defined(TI_WITH_CUDA) - auto func = module->getFunction("cuda_compute_capability"); - if (func) { - func->deleteBody(); - auto bb = llvm::BasicBlock::Create(*ctx, "entry", func); - IRBuilder<> builder(*ctx); - builder.SetInsertPoint(bb); - builder.CreateRet( - get_constant(CUDAContext::get_instance().get_compute_capability())); - TaichiLLVMContext::mark_inline(func); - } -#endif - - patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x); - patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64); - patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x); - patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x); - patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x); - patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false); - patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false); - patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); - patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false); - patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false); - - patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all); - patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync); - - patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any); - patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync); - - patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni); - patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync); - - patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot); - patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync); - - patch_intrinsic("cuda_shfl_down_sync_i32", - Intrinsic::nvvm_shfl_sync_down_i32); - patch_intrinsic("cuda_shfl_down_sync_f32", - Intrinsic::nvvm_shfl_sync_down_f32); - - patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32); - patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32); - - patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32); - - patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32); - - patch_intrinsic("cuda_shfl_xor_sync_i32", - Intrinsic::nvvm_shfl_sync_bfly_i32); - - patch_intrinsic("cuda_match_any_sync_i32", - Intrinsic::nvvm_match_any_sync_i32); - - // LLVM 10.0.0 seems to have a bug on this intrinsic function - /* - nvvm_match_all_sync_i32 - Args: - 1. u32 mask - 2. i32 value - 3. i32 *pred - */ - /* - patch_intrinsic("cuda_match_all_sync_i32p", - Intrinsic::nvvm_math_all_sync_i32); - */ - - // LLVM 10.0.0 seems to have a bug on this intrinsic function - /* - patch_intrinsic("cuda_match_any_sync_i64", - Intrinsic::nvvm_match_any_sync_i64); - */ - - patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true, - {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)}); - patch_intrinsic("cttz_i32", Intrinsic::cttz, true, - {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)}); - - patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); - - link_module_with_cuda_libdevice(module); - - // To prevent potential symbol name conflicts, we use "cuda_vprintf" - // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for - // linking - for (auto &f : *module) { - if (f.getName() == "cuda_vprintf") { - f.setName("vprintf"); + + + if (arch_ == Arch::cuda) { + module->setTargetTriple("nvptx64-nvidia-cuda"); + + #if defined(TI_WITH_CUDA) + auto func = module->getFunction("cuda_compute_capability"); + if (func) { + func->deleteBody(); + auto bb = llvm::BasicBlock::Create(*ctx, "entry", func); + IRBuilder<> builder(*ctx); + builder.SetInsertPoint(bb); + builder.CreateRet( + get_constant(CUDAContext::get_instance().get_compute_capability())); + TaichiLLVMContext::mark_inline(func); + } + #endif + + patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x); + patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64); + patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x); + patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x); + patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x); + patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false); + patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false); + patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); + patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false); + patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false); + + patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all); + patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync); + + patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any); + patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync); + + patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni); + patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync); + + patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot); + patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync); + + patch_intrinsic("cuda_shfl_down_sync_i32", + Intrinsic::nvvm_shfl_sync_down_i32); + patch_intrinsic("cuda_shfl_down_sync_f32", + Intrinsic::nvvm_shfl_sync_down_f32); + + patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32); + patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32); + + patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32); + + patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32); + + patch_intrinsic("cuda_shfl_xor_sync_i32", + Intrinsic::nvvm_shfl_sync_bfly_i32); + + patch_intrinsic("cuda_match_any_sync_i32", + Intrinsic::nvvm_match_any_sync_i32); + + // LLVM 10.0.0 seems to have a bug on this intrinsic function + /* + nvvm_match_all_sync_i32 + Args: + 1. u32 mask + 2. i32 value + 3. i32 *pred + */ + /* + patch_intrinsic("cuda_match_all_sync_i32p", + Intrinsic::nvvm_math_all_sync_i32); + */ + + // LLVM 10.0.0 seems to have a bug on this intrinsic function + /* + patch_intrinsic("cuda_match_any_sync_i64", + Intrinsic::nvvm_match_any_sync_i64); + */ + + patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true, + {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)}); + patch_intrinsic("cttz_i32", Intrinsic::cttz, true, + {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)}); + + patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); + + link_module_with_cuda_libdevice(module); + + // To prevent potential symbol name conflicts, we use "cuda_vprintf" + // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for + // linking + for (auto &f : *module) { + if (f.getName() == "cuda_vprintf") { + f.setName("vprintf"); + } } - } - // runtime_module->print(llvm::errs(), nullptr); - } + // runtime_module->print(llvm::errs(), nullptr); + } - if (arch_ == Arch::amdgpu) { - module->setTargetTriple("amdgcn-amd-amdhsa"); - for (auto &f : *module) { - f.addFnAttr("target-cpu",""); - f.addFnAttr("target-features",""); - for (auto &bb: f) { - std::vector alloca_inst_vec; - for (llvm::Instruction &inst : bb) { - llvm::AllocaInst* now_alloca = llvm::dyn_cast(&inst); - if (!now_alloca || - now_alloca->getType()->getAddressSpace() != (unsigned)0) { - continue; - } - alloca_inst_vec.push_back(now_alloca); - } - for (auto &allocainst : alloca_inst_vec) { - auto alloca_type = allocainst->getAllocatedType(); - llvm::IRBuilder<> builder(allocainst); - auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); - auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); - new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); - auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); - allocainst->replaceAllUsesWith(addrspacecast); - allocainst->eraseFromParent(); + if (arch_ == Arch::amdgpu) { + module->setTargetTriple("amdgcn-amd-amdhsa"); + for (auto &f : *module) { + f.addFnAttr("target-cpu",""); + f.addFnAttr("target-features",""); + for (auto &bb: f) { + std::vector alloca_inst_vec; + for (llvm::Instruction &inst : bb) { + llvm::AllocaInst* now_alloca = llvm::dyn_cast(&inst); + if (!now_alloca || + now_alloca->getType()->getAddressSpace() != (unsigned)0) { + continue; + } + alloca_inst_vec.push_back(now_alloca); + } + for (auto &allocainst : alloca_inst_vec) { + auto alloca_type = allocainst->getAllocatedType(); + llvm::IRBuilder<> builder(allocainst); + auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); + auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); + new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); + auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); + allocainst->replaceAllUsesWith(addrspacecast); + allocainst->eraseFromParent(); + } } } + patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x); + patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x); } - patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x); - patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x); } return module; From 7fd4225003e58cba22f657466c5806c46fcc32cc Mon Sep 17 00:00:00 2001 From: zeyuli Date: Tue, 1 Nov 2022 01:40:06 +0800 Subject: [PATCH 05/12] add macro control --- taichi/runtime/llvm/llvm_context.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 87f25433b15f4..6e87b8d7f16fa 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -338,6 +338,7 @@ std::unique_ptr TaichiLLVMContext::module_from_file( std::unique_ptr module = module_from_bitcode_file( fmt::format("{}/{}", runtime_lib_dir(), file), ctx); if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) { +#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, bool ret = true, std::vector types = {}, @@ -391,8 +392,8 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); +#endif - if (arch_ == Arch::cuda) { module->setTargetTriple("nvptx64-nvidia-cuda"); @@ -492,6 +493,7 @@ std::unique_ptr TaichiLLVMContext::module_from_file( if (arch_ == Arch::amdgpu) { module->setTargetTriple("amdgcn-amd-amdhsa"); +#ifdef TI_WITH_AMDGPU for (auto &f : *module) { f.addFnAttr("target-cpu",""); f.addFnAttr("target-features",""); @@ -519,6 +521,7 @@ std::unique_ptr TaichiLLVMContext::module_from_file( } patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x); patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x); +#endif } } From 82b4ccb3f14e952864029d848d31f2c571af8566 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Tue, 1 Nov 2022 01:52:05 +0800 Subject: [PATCH 06/12] del extra control --- taichi/runtime/llvm/llvm_context.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 6e87b8d7f16fa..94cb364339038 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -338,7 +338,6 @@ std::unique_ptr TaichiLLVMContext::module_from_file( std::unique_ptr module = module_from_bitcode_file( fmt::format("{}/{}", runtime_lib_dir(), file), ctx); if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) { -#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU) auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, bool ret = true, std::vector types = {}, @@ -392,7 +391,6 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); -#endif if (arch_ == Arch::cuda) { module->setTargetTriple("nvptx64-nvidia-cuda"); From d30e24b96a243d8fa76a7b582c4c082aa7ac8cdf Mon Sep 17 00:00:00 2001 From: zeyuli Date: Tue, 1 Nov 2022 01:56:40 +0800 Subject: [PATCH 07/12] fix typo --- taichi/runtime/llvm/llvm_runtime_executor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 6d087696ef71b..1fbddac1b3a05 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -112,7 +112,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, } #endif -#if define(TI_WITH_AMDGPU) +#if defined(TI_WITH_AMDGPU) if (config.arch == Arch::amdgpu) { AMDGPUContext::get_instance().set_debug(config.debug); device_ = std::make_shared(); From 660b98de492de189a39a5b4703e9ca775648e373 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 31 Oct 2022 17:57:53 +0000 Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/runtime/llvm/llvm_context.cpp | 88 +++++++++++++++------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 94cb364339038..4c1af07203e5a 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -17,7 +17,7 @@ #include "llvm/IR/IntrinsicsNVPTX.h" #ifdef TI_WITH_AMDGPU #include "llvm/IR/IntrinsicsAMDGPU.h" -#endif //TI_WITH_AMDGPU +#endif // TI_WITH_AMDGPU #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" @@ -338,7 +338,7 @@ std::unique_ptr TaichiLLVMContext::module_from_file( std::unique_ptr module = module_from_bitcode_file( fmt::format("{}/{}", runtime_lib_dir(), file), ctx); if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) { - auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, + auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, bool ret = true, std::vector types = {}, std::vector extra_args = {}) { @@ -391,11 +391,11 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add); patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd); patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd); - + if (arch_ == Arch::cuda) { module->setTargetTriple("nvptx64-nvidia-cuda"); - #if defined(TI_WITH_CUDA) +#if defined(TI_WITH_CUDA) auto func = module->getFunction("cuda_compute_capability"); if (func) { func->deleteBody(); @@ -406,7 +406,7 @@ std::unique_ptr TaichiLLVMContext::module_from_file( get_constant(CUDAContext::get_instance().get_compute_capability())); TaichiLLVMContext::mark_inline(func); } - #endif +#endif patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x); patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64); @@ -436,8 +436,10 @@ std::unique_ptr TaichiLLVMContext::module_from_file( patch_intrinsic("cuda_shfl_down_sync_f32", Intrinsic::nvvm_shfl_sync_down_f32); - patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32); - patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32); + patch_intrinsic("cuda_shfl_up_sync_i32", + Intrinsic::nvvm_shfl_sync_up_i32); + patch_intrinsic("cuda_shfl_up_sync_f32", + Intrinsic::nvvm_shfl_sync_up_f32); patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32); @@ -493,27 +495,28 @@ std::unique_ptr TaichiLLVMContext::module_from_file( module->setTargetTriple("amdgcn-amd-amdhsa"); #ifdef TI_WITH_AMDGPU for (auto &f : *module) { - f.addFnAttr("target-cpu",""); - f.addFnAttr("target-features",""); - for (auto &bb: f) { - std::vector alloca_inst_vec; + f.addFnAttr("target-cpu", ""); + f.addFnAttr("target-features", ""); + for (auto &bb : f) { + std::vector alloca_inst_vec; for (llvm::Instruction &inst : bb) { - llvm::AllocaInst* now_alloca = llvm::dyn_cast(&inst); - if (!now_alloca || - now_alloca->getType()->getAddressSpace() != (unsigned)0) { - continue; - } - alloca_inst_vec.push_back(now_alloca); + llvm::AllocaInst *now_alloca = llvm::dyn_cast(&inst); + if (!now_alloca || + now_alloca->getType()->getAddressSpace() != (unsigned)0) { + continue; + } + alloca_inst_vec.push_back(now_alloca); } for (auto &allocainst : alloca_inst_vec) { - auto alloca_type = allocainst->getAllocatedType(); - llvm::IRBuilder<> builder(allocainst); - auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); - auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); - new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); - auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); - allocainst->replaceAllUsesWith(addrspacecast); - allocainst->eraseFromParent(); + auto alloca_type = allocainst->getAllocatedType(); + llvm::IRBuilder<> builder(allocainst); + auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); + auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); + new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); + auto *addrspacecast = + builder.CreateAddrSpaceCast(new_alloca, new_type); + allocainst->replaceAllUsesWith(addrspacecast); + allocainst->eraseFromParent(); } } } @@ -857,7 +860,7 @@ void TaichiLLVMContext::update_runtime_jit_module( } } - if (arch_ == Arch::amdgpu) { + if (arch_ == Arch::amdgpu) { for (auto &f : *module) { bool is_kernel = false; const std::string func_name = f.getName().str(); @@ -875,35 +878,40 @@ void TaichiLLVMContext::update_runtime_jit_module( } for (auto &f : global_func) { llvm::FunctionType *func_type = f->getFunctionType(); - std::vector new_func_params; + std::vector new_func_params; for (auto &arg : f->args()) { if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { - auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1)); + auto new_type = llvm::PointerType::get( + arg.getType()->getPointerElementType(), unsigned(1)); new_func_params.push_back(new_type); - } - else { + } else { new_func_params.push_back(arg.getType()); } - } - auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false); - auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace()); - //NF->copyAttributesFrom(f); + } + auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), + new_func_params, false); + auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), + f->getAddressSpace()); + // NF->copyAttributesFrom(f); new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); new_func->setComdat(f->getComdat()); f->getParent()->getFunctionList().insert(f->getIterator(), new_func); new_func->takeName(f); - new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList()); + new_func->getBasicBlockList().splice(new_func->begin(), + f->getBasicBlockList()); for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), - I2 = new_func->arg_begin(); I != E; ++I, ++I2) { + I2 = new_func->arg_begin(); + I != E; ++I, ++I2) { if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { auto &front_bb = new_func->getBasicBlockList().front(); - llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType()); - front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast); + llvm::Instruction *addrspacecast = + new AddrSpaceCastInst(I2, I->getType()); + front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), + addrspacecast); I->replaceAllUsesWith(addrspacecast); I2->takeName(&*I); - } - else { + } else { I->replaceAllUsesWith(&*I2); I2->takeName(&*I); } From 1e8c1f465309e927c911ec4ddb8d2a2fc9e7cd7c Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 22 Dec 2022 21:20:07 +0800 Subject: [PATCH 09/12] update pass to llvm_pass --- taichi/runtime/llvm/llvm_context.cpp | 105 +++----------------- taichi/runtime/llvm/llvm_context.h | 2 - taichi/runtime/llvm/llvm_context_pass.h | 126 ++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 94 deletions(-) create mode 100644 taichi/runtime/llvm/llvm_context_pass.h diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 4c1af07203e5a..beb0e6309feaa 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -50,6 +50,10 @@ #include "llvm_context.h" #include "taichi/runtime/program_impls/llvm/llvm_program.h" #include "taichi/codegen/codegen_utils.h" +#ifdef TI_WITH_AMDGPU +#include "taichi/runtime/llvm/llvm_context_pass.h" +#endif + #ifdef _WIN32 // Travis CI seems doesn't support ... @@ -494,32 +498,13 @@ std::unique_ptr TaichiLLVMContext::module_from_file( if (arch_ == Arch::amdgpu) { module->setTargetTriple("amdgcn-amd-amdhsa"); #ifdef TI_WITH_AMDGPU - for (auto &f : *module) { - f.addFnAttr("target-cpu", ""); - f.addFnAttr("target-features", ""); - for (auto &bb : f) { - std::vector alloca_inst_vec; - for (llvm::Instruction &inst : bb) { - llvm::AllocaInst *now_alloca = llvm::dyn_cast(&inst); - if (!now_alloca || - now_alloca->getType()->getAddressSpace() != (unsigned)0) { - continue; - } - alloca_inst_vec.push_back(now_alloca); - } - for (auto &allocainst : alloca_inst_vec) { - auto alloca_type = allocainst->getAllocatedType(); - llvm::IRBuilder<> builder(allocainst); - auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); - auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); - new_alloca->setAlignment(llvm::Align(allocainst->getAlignment())); - auto *addrspacecast = - builder.CreateAddrSpaceCast(new_alloca, new_type); - allocainst->replaceAllUsesWith(addrspacecast); - allocainst->eraseFromParent(); - } - } + llvm::legacy::FunctionPassManager function_pass_manager(module.get()); + function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass()); + function_pass_manager.doInitialization(); + for (auto func = module->begin(); func != module->end(); ++func) { + function_pass_manager.run(*func); } + function_pass_manager.doFinalization(); patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x); patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x); #endif @@ -749,11 +734,6 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func, } } -void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) { - func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); -} - void TaichiLLVMContext::eliminate_unused_functions( llvm::Module *module, std::function export_indicator) { @@ -861,68 +841,9 @@ void TaichiLLVMContext::update_runtime_jit_module( } if (arch_ == Arch::amdgpu) { - for (auto &f : *module) { - bool is_kernel = false; - const std::string func_name = f.getName().str(); - if (starts_with(func_name, "runtime_")) { - mark_function_as_amdgpu_kernel(&f); - is_kernel = true; - } - if (!is_kernel && !f.isDeclaration()) - f.setLinkage(llvm::Function::PrivateLinkage); - } - std::vector global_func; - for (auto &f : *module) { - if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) - global_func.push_back(&f); - } - for (auto &f : global_func) { - llvm::FunctionType *func_type = f->getFunctionType(); - std::vector new_func_params; - for (auto &arg : f->args()) { - if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { - auto new_type = llvm::PointerType::get( - arg.getType()->getPointerElementType(), unsigned(1)); - new_func_params.push_back(new_type); - } else { - new_func_params.push_back(arg.getType()); - } - } - auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), - new_func_params, false); - auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), - f->getAddressSpace()); - // NF->copyAttributesFrom(f); - new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); - new_func->setComdat(f->getComdat()); - f->getParent()->getFunctionList().insert(f->getIterator(), new_func); - new_func->takeName(f); - new_func->getBasicBlockList().splice(new_func->begin(), - f->getBasicBlockList()); - for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), - I2 = new_func->arg_begin(); - I != E; ++I, ++I2) { - if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { - auto &front_bb = new_func->getBasicBlockList().front(); - llvm::Instruction *addrspacecast = - new AddrSpaceCastInst(I2, I->getType()); - front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), - addrspacecast); - I->replaceAllUsesWith(addrspacecast); - I2->takeName(&*I); - } else { - I->replaceAllUsesWith(&*I2); - I2->takeName(&*I); - } - } - - SmallVector, 1> MDs; - f->getAllMetadata(MDs); - for (auto [KindID, Node] : MDs) - new_func->addMetadata(KindID, *Node); - f->eraseFromParent(); - } + llvm::legacy::PassManager module_pass_manager; + module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass()); + module_pass_manager.run(*module); } eliminate_unused_functions(module.get(), [](std::string func_name) { diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h index 58ca21f68c0d6..ddea66efd763d 100644 --- a/taichi/runtime/llvm/llvm_context.h +++ b/taichi/runtime/llvm/llvm_context.h @@ -125,8 +125,6 @@ class TaichiLLVMContext { void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0); - void mark_function_as_amdgpu_kernel(llvm::Function *func); - void fetch_this_thread_struct_module(); llvm::Module *get_this_thread_runtime_module(); llvm::Function *get_runtime_function(const std::string &name); diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h new file mode 100644 index 0000000000000..c6d2cd7a64ada --- /dev/null +++ b/taichi/runtime/llvm/llvm_context_pass.h @@ -0,0 +1,126 @@ +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Function.h" +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/Host.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/CodeGen.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" + +#if defined(TI_WITH_AMDGPU) +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#endif + +namespace taichi { +namespace lang { +using namespace llvm; +struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass { + static char ID; + AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {} + bool runOnFunction(llvm::Function &f) override { + f.addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4)); + f.addFnAttr("target-features",""); + for (auto &bb: f) { + std::vector alloca_inst_vec; + for (Instruction &inst : bb) { + AllocaInst* now_alloca = dyn_cast(&inst); + if (!now_alloca || + now_alloca->getType()->getAddressSpace() != (unsigned)0) { + continue; + } + alloca_inst_vec.push_back(now_alloca); + } + for (auto &allocainst : alloca_inst_vec) { + auto alloca_type = allocainst->getAllocatedType(); + IRBuilder<> builder(allocainst); + auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); + auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); + new_alloca->setAlignment(Align(allocainst->getAlign().value())); + auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); + allocainst->replaceAllUsesWith(addrspacecast); + allocainst->eraseFromParent(); + } + } + return false; + } +}; + +struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass { + static char ID; + AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {} + bool runOnModule(llvm::Module &M) override { + for (auto &f : M) { + bool is_kernel = false; + const std::string func_name = f.getName().str(); + if (starts_with(func_name, "runtime_")) { + f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + f.addFnAttr("amdgpu-flat-work-group-size", "1, 256"); + is_kernel = true; + } + if (!is_kernel && !f.isDeclaration()) + f.setLinkage(llvm::Function::PrivateLinkage); + } + std::vector global_func; + for (auto &f : M) { + if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) + global_func.push_back(&f); + } + for (auto &f : global_func) { + llvm::FunctionType *func_type = f->getFunctionType(); + std::vector new_func_params; + for (auto &arg : f->args()) { + if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { + auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1)); + new_func_params.push_back(new_type); + } + else { + new_func_params.push_back(arg.getType()); + } + } + auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false); + auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace()); + new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); + new_func->addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4)); + new_func->setComdat(f->getComdat()); + f->getParent()->getFunctionList().insert(f->getIterator(), new_func); + new_func->takeName(f); + new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList()); + for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), + I2 = new_func->arg_begin(); I != E; ++I, ++I2) { + if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { + auto &front_bb = new_func->getBasicBlockList().front(); + llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType()); + front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast); + I->replaceAllUsesWith(addrspacecast); + I2->takeName(&*I); + } + else { + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); + } + } + + f->eraseFromParent(); + } + return false; + } +}; + +char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0; +char AMDGPUConvertFuncParamAddressSpacePass::ID = 0; + +} // namespace lang +} // namespace taichi \ No newline at end of file From 325732a1d1307e29337396881c0629d21898c4e6 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 22 Dec 2022 21:28:04 +0800 Subject: [PATCH 10/12] fix bug and solve conversation --- taichi/runtime/llvm/llvm_context.cpp | 2 ++ taichi/runtime/llvm/llvm_context_pass.h | 13 +++++++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index beb0e6309feaa..8cd5cf4e785dd 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -841,9 +841,11 @@ void TaichiLLVMContext::update_runtime_jit_module( } if (arch_ == Arch::amdgpu) { +#ifdef TI_WITH_AMDGPU llvm::legacy::PassManager module_pass_manager; module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass()); module_pass_manager.run(*module); +#endif } eliminate_unused_functions(module.get(), [](std::string func_name) { diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h index c6d2cd7a64ada..b1a09325c4ea5 100644 --- a/taichi/runtime/llvm/llvm_context_pass.h +++ b/taichi/runtime/llvm/llvm_context_pass.h @@ -66,18 +66,23 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass { const std::string func_name = f.getName().str(); if (starts_with(func_name, "runtime_")) { f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - f.addFnAttr("amdgpu-flat-work-group-size", "1, 256"); + // ref https://llvm.org/docs/AMDGPUUsage.html + // “amdgpu-flat-work-group-size”=”min,max” + // Specify the minimum and maximum flat work group sizes that will be specified when the kernel is dispatched. + // Generated by the amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. + // The implied default value is 1,1024. + f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); is_kernel = true; } if (!is_kernel && !f.isDeclaration()) f.setLinkage(llvm::Function::PrivateLinkage); } - std::vector global_func; + std::vector kernel_function; for (auto &f : M) { if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) - global_func.push_back(&f); + kernel_function.push_back(&f); } - for (auto &f : global_func) { + for (auto &f : kernel_function) { llvm::FunctionType *func_type = f->getFunctionType(); std::vector new_func_params; for (auto &arg : f->args()) { From bd6b58d54232bcacdf98eab444c2c362e30f852a Mon Sep 17 00:00:00 2001 From: zeyuli Date: Fri, 23 Dec 2022 23:21:14 +0800 Subject: [PATCH 11/12] del extra header file in llvm_context_pass.h --- taichi/runtime/llvm/llvm_context_pass.h | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h index b1a09325c4ea5..33006812ebeb6 100644 --- a/taichi/runtime/llvm/llvm_context_pass.h +++ b/taichi/runtime/llvm/llvm_context_pass.h @@ -1,19 +1,10 @@ -#include "llvm/Analysis/TargetTransformInfo.h" +#pragma once + #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Function.h" #include "llvm/Pass.h" #include "llvm/IR/Module.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Linker/Linker.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/Host.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/IR/Instructions.h" @@ -26,6 +17,7 @@ namespace taichi { namespace lang { using namespace llvm; +#if defined(TI_WITH_AMDGPU) struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass { static char ID; AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {} @@ -126,6 +118,7 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass { char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0; char AMDGPUConvertFuncParamAddressSpacePass::ID = 0; +#endif } // namespace lang -} // namespace taichi \ No newline at end of file +} // namespace taichi From c015143617602a4ce2a42e17e685640fa4354a65 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 23 Dec 2022 15:22:38 +0000 Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/runtime/llvm/llvm_context.cpp | 1 - taichi/runtime/llvm/llvm_context_pass.h | 187 +++++++++++++----------- 2 files changed, 99 insertions(+), 89 deletions(-) diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp index 8cd5cf4e785dd..31ee42da2791f 100644 --- a/taichi/runtime/llvm/llvm_context.cpp +++ b/taichi/runtime/llvm/llvm_context.cpp @@ -54,7 +54,6 @@ #include "taichi/runtime/llvm/llvm_context_pass.h" #endif - #ifdef _WIN32 // Travis CI seems doesn't support ... #include diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h index 33006812ebeb6..d48303dbc6670 100644 --- a/taichi/runtime/llvm/llvm_context_pass.h +++ b/taichi/runtime/llvm/llvm_context_pass.h @@ -1,5 +1,5 @@ #pragma once - + #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Function.h" @@ -19,106 +19,117 @@ namespace lang { using namespace llvm; #if defined(TI_WITH_AMDGPU) struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass { - static char ID; - AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {} - bool runOnFunction(llvm::Function &f) override { - f.addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4)); - f.addFnAttr("target-features",""); - for (auto &bb: f) { - std::vector alloca_inst_vec; - for (Instruction &inst : bb) { - AllocaInst* now_alloca = dyn_cast(&inst); - if (!now_alloca || - now_alloca->getType()->getAddressSpace() != (unsigned)0) { - continue; - } - alloca_inst_vec.push_back(now_alloca); - } - for (auto &allocainst : alloca_inst_vec) { - auto alloca_type = allocainst->getAllocatedType(); - IRBuilder<> builder(allocainst); - auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); - auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); - new_alloca->setAlignment(Align(allocainst->getAlign().value())); - auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); - allocainst->replaceAllUsesWith(addrspacecast); - allocainst->eraseFromParent(); - } + static char ID; + AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) { + } + bool runOnFunction(llvm::Function &f) override { + f.addFnAttr("target-cpu", + "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4)); + f.addFnAttr("target-features", ""); + for (auto &bb : f) { + std::vector alloca_inst_vec; + for (Instruction &inst : bb) { + AllocaInst *now_alloca = dyn_cast(&inst); + if (!now_alloca || + now_alloca->getType()->getAddressSpace() != (unsigned)0) { + continue; } - return false; + alloca_inst_vec.push_back(now_alloca); + } + for (auto &allocainst : alloca_inst_vec) { + auto alloca_type = allocainst->getAllocatedType(); + IRBuilder<> builder(allocainst); + auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5); + auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0); + new_alloca->setAlignment(Align(allocainst->getAlign().value())); + auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type); + allocainst->replaceAllUsesWith(addrspacecast); + allocainst->eraseFromParent(); + } } + return false; + } }; struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass { - static char ID; - AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {} - bool runOnModule(llvm::Module &M) override { - for (auto &f : M) { - bool is_kernel = false; - const std::string func_name = f.getName().str(); - if (starts_with(func_name, "runtime_")) { - f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - // ref https://llvm.org/docs/AMDGPUUsage.html - // “amdgpu-flat-work-group-size”=”min,max” - // Specify the minimum and maximum flat work group sizes that will be specified when the kernel is dispatched. - // Generated by the amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. - // The implied default value is 1,1024. - f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); - is_kernel = true; - } - if (!is_kernel && !f.isDeclaration()) - f.setLinkage(llvm::Function::PrivateLinkage); - } - std::vector kernel_function; - for (auto &f : M) { - if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) - kernel_function.push_back(&f); + static char ID; + AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) { + } + bool runOnModule(llvm::Module &M) override { + for (auto &f : M) { + bool is_kernel = false; + const std::string func_name = f.getName().str(); + if (starts_with(func_name, "runtime_")) { + f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + // ref https://llvm.org/docs/AMDGPUUsage.html + // “amdgpu-flat-work-group-size”=”min,max” + // Specify the minimum and maximum flat work group sizes that will be + // specified when the kernel is dispatched. Generated by the + // amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied + // default value is 1,1024. + f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); + is_kernel = true; } - for (auto &f : kernel_function) { - llvm::FunctionType *func_type = f->getFunctionType(); - std::vector new_func_params; - for (auto &arg : f->args()) { - if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { - auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1)); - new_func_params.push_back(new_type); - } - else { - new_func_params.push_back(arg.getType()); - } + if (!is_kernel && !f.isDeclaration()) + f.setLinkage(llvm::Function::PrivateLinkage); + } + std::vector kernel_function; + for (auto &f : M) { + if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL) + kernel_function.push_back(&f); + } + for (auto &f : kernel_function) { + llvm::FunctionType *func_type = f->getFunctionType(); + std::vector new_func_params; + for (auto &arg : f->args()) { + if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) { + auto new_type = llvm::PointerType::get( + arg.getType()->getPointerElementType(), unsigned(1)); + new_func_params.push_back(new_type); + } else { + new_func_params.push_back(arg.getType()); } - auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false); - auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace()); - new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); - new_func->addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4)); - new_func->setComdat(f->getComdat()); - f->getParent()->getFunctionList().insert(f->getIterator(), new_func); - new_func->takeName(f); - new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList()); - for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), - I2 = new_func->arg_begin(); I != E; ++I, ++I2) { - if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { - auto &front_bb = new_func->getBasicBlockList().front(); - llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType()); - front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast); - I->replaceAllUsesWith(addrspacecast); - I2->takeName(&*I); - } - else { - I->replaceAllUsesWith(&*I2); - I2->takeName(&*I); - } + } + auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), + new_func_params, false); + auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), + f->getAddressSpace()); + new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024"); + new_func->addFnAttr( + "target-cpu", + "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4)); + new_func->setComdat(f->getComdat()); + f->getParent()->getFunctionList().insert(f->getIterator(), new_func); + new_func->takeName(f); + new_func->getBasicBlockList().splice(new_func->begin(), + f->getBasicBlockList()); + for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(), + I2 = new_func->arg_begin(); + I != E; ++I, ++I2) { + if (I->getType()->getTypeID() == llvm::Type::PointerTyID) { + auto &front_bb = new_func->getBasicBlockList().front(); + llvm::Instruction *addrspacecast = + new AddrSpaceCastInst(I2, I->getType()); + front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), + addrspacecast); + I->replaceAllUsesWith(addrspacecast); + I2->takeName(&*I); + } else { + I->replaceAllUsesWith(&*I2); + I2->takeName(&*I); } - - f->eraseFromParent(); } - return false; + + f->eraseFromParent(); } + return false; + } }; char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0; char AMDGPUConvertFuncParamAddressSpacePass::ID = 0; #endif -} // namespace lang -} // namespace taichi +} // namespace lang +} // namespace taichi