[amdgpu] Part3 update runtime module (#6486)

Issue: #6434 ### Brief Summary 1. This is a special part of the Tacihi runtime module for the `AMDGPU` backend. Tacihi's runtime module uses `clang++` to generate `LLVM IR` is different in memory allocation differs from the cpu-generated `LLVM IR`. The following is an example. ``` C/C++ code void func(int *a, int *b) { *a = *b; } x86_64 backend LLVM IR define dso_local void @cpu_func(i32* %0, i32* %1) #2 { %3 = alloca i32*, align 8 %4 = alloca i32*, align 8 store i32* %0, i32** %3, align 8 store i32* %1, i32** %4, align 8 %5 = load i32*, i32** %4, align 8 %6 = load i32, i32* %5, align 4 %7 = load i32*, i32** %3, align 8 store i32 %6, i32* %7, align 4 ret void } __global__ function on AMDGPU define protected amdgpu_kernel void @global_func(i32 addrspace(1)* %0, i32 addrspace(1)* %1) #4 { %3 = alloca i32*, align 8, addrspace(5) %4 = alloca i32*, align 8, addrspace(5) %5 = alloca i32*, align 8, addrspace(5) %6 = alloca i32*, align 8, addrspace(5) %7 = addrspacecast i32* addrspace(5)* %3 to i32** %8 = addrspacecast i32* addrspace(5)* %4 to i32** %9 = addrspacecast i32* addrspace(5)* %5 to i32** %10 = addrspacecast i32* addrspace(5)* %6 to i32** %11 = addrspacecast i32 addrspace(1)* %0 to i32* store i32* %11, i32** %7, align 8 %12 = load i32*, i32** %7, align 8 %13 = addrspacecast i32 addrspace(1)* %1 to i32* store i32* %13, i32** %8, align 8 %14 = load i32*, i32** %8, align 8 store i32* %12, i32** %9, align 8 store i32* %14, i32** %10, align 8 %15 = load i32*, i32** %10, align 8 %16 = load i32, i32* %15, align 4 %17 = load i32*, i32** %9, align 8 store i32 %16, i32* %17, align 4 ret void } __device__ function on AMDGPU define hidden void @device_func(i32* %0, i32* %1) #2 { %3 = alloca i32*, align 8, addrspace(5) %4 = alloca i32*, align 8, addrspace(5) %5 = addrspacecast i32* addrspace(5)* %3 to i32** %6 = addrspacecast i32* addrspace(5)* %4 to i32** store i32* %0, i32** %5, align 8 store i32* %1, i32** %6, align 8 %7 = load i32*, i32** %6, align 8 %8 = load i32, i32* %7, align 4 %9 = load i32*, i32** %5, align 8 store i32 %8, i32* %9, align 4 ret void } ``` 2. There are some differences in the place about `allocainst`, specifically about addrspace (for `AMDGPU`, [this](https://llvm.org/docs/AMDGPUUsage.html#address-spaces) will be helpful). I have not found documentation describing how to write the correct `LLVM IR` on `AMDGPU`, through my observation of the `LLVM IR` generated by `clang++/hipcc`. We need to deal with the arguments of the `__global__` function and the `allocainst` (including specifying the addrspace of `allocainst` and performing addrspace-cast) while for the `__device__` function we do not need to deal with the arguments of the function. Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
taichi-dev · Dec 30, 2022 · d321d12 · d321d12
1 parent c1844bd
commit d321d12
Show file tree

Hide file tree

Showing 5 changed files with 282 additions and 94 deletions.
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
@@ -15,6 +15,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#ifdef TI_WITH_AMDGPU
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#endif  // TI_WITH_AMDGPU
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -43,6 +46,9 @@
 #include "llvm_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/codegen_utils.h"
+#ifdef TI_WITH_AMDGPU
+#include "taichi/runtime/llvm/llvm_context_pass.h"
+#endif
 
 #ifdef _WIN32
 // Travis CI seems doesn't support <filesystem>...
@@ -331,22 +337,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   auto ctx = get_this_thread_context();
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
-  if (arch_ == Arch::cuda) {
-    module->setTargetTriple("nvptx64-nvidia-cuda");
-
-#if defined(TI_WITH_CUDA)
-    auto func = module->getFunction("cuda_compute_capability");
-    if (func) {
-      func->deleteBody();
-      auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
-      IRBuilder<> builder(*ctx);
-      builder.SetInsertPoint(bb);
-      builder.CreateRet(
-          get_constant(CUDAContext::get_instance().get_compute_capability()));
-      TaichiLLVMContext::mark_inline(func);
-    }
-#endif
-
+  if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
     auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
@@ -391,93 +382,124 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
       TaichiLLVMContext::mark_inline(func);
     };
 
-    patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
-    patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
-    patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
-    patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
-    patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
-    patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
-    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
-    patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
-    patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
-
-    patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
-    patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
-
-    patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
-    patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
-
-    patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
-    patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
-
-    patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
-    patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
-
-    patch_intrinsic("cuda_shfl_down_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_down_i32);
-    patch_intrinsic("cuda_shfl_down_sync_f32",
-                    Intrinsic::nvvm_shfl_sync_down_f32);
-
-    patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
-    patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);
-
-    patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
-
-    patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);
-
-    patch_intrinsic("cuda_shfl_xor_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_bfly_i32);
-
-    patch_intrinsic("cuda_match_any_sync_i32",
-                    Intrinsic::nvvm_match_any_sync_i32);
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    nvvm_match_all_sync_i32
-    Args:
-        1. u32 mask
-        2. i32 value
-        3. i32 *pred
-    */
-    /*
-    patch_intrinsic("cuda_match_all_sync_i32p",
-                    Intrinsic::nvvm_math_all_sync_i32);
-    */
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    patch_intrinsic("cuda_match_any_sync_i64",
-                    Intrinsic::nvvm_match_any_sync_i64);
-                    */
-
-    patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-    patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-
     patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);
-
     patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
-
-    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-
     patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
+    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
 
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+    if (arch_ == Arch::cuda) {
+      module->setTargetTriple("nvptx64-nvidia-cuda");
 
-    link_module_with_cuda_libdevice(module);
+#if defined(TI_WITH_CUDA)
+      auto func = module->getFunction("cuda_compute_capability");
+      if (func) {
+        func->deleteBody();
+        auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
+        IRBuilder<> builder(*ctx);
+        builder.SetInsertPoint(bb);
+        builder.CreateRet(
+            get_constant(CUDAContext::get_instance().get_compute_capability()));
+        TaichiLLVMContext::mark_inline(func);
+      }
+#endif
 
-    // To prevent potential symbol name conflicts, we use "cuda_vprintf"
-    // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
-    // linking
-    for (auto &f : *module) {
-      if (f.getName() == "cuda_vprintf") {
-        f.setName("vprintf");
+      patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
+      patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
+      patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
+      patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+      patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
+      patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+      patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+      patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
+      patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
+
+      patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
+      patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
+
+      patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
+      patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
+
+      patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
+      patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
+
+      patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
+      patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
+
+      patch_intrinsic("cuda_shfl_down_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_down_i32);
+      patch_intrinsic("cuda_shfl_down_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_down_f32);
+
+      patch_intrinsic("cuda_shfl_up_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_up_i32);
+      patch_intrinsic("cuda_shfl_up_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_up_f32);
+
+      patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
+
+      patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);
+
+      patch_intrinsic("cuda_shfl_xor_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_bfly_i32);
+
+      patch_intrinsic("cuda_match_any_sync_i32",
+                      Intrinsic::nvvm_match_any_sync_i32);
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      nvvm_match_all_sync_i32
+      Args:
+          1. u32 mask
+          2. i32 value
+          3. i32 *pred
+      */
+      /*
+      patch_intrinsic("cuda_match_all_sync_i32p",
+                      Intrinsic::nvvm_math_all_sync_i32);
+      */
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      patch_intrinsic("cuda_match_any_sync_i64",
+                      Intrinsic::nvvm_match_any_sync_i64);
+                      */
+
+      patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+      patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+
+      link_module_with_cuda_libdevice(module);
+
+      // To prevent potential symbol name conflicts, we use "cuda_vprintf"
+      // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
+      // linking
+      for (auto &f : *module) {
+        if (f.getName() == "cuda_vprintf") {
+          f.setName("vprintf");
+        }
       }
+
+      // runtime_module->print(llvm::errs(), nullptr);
     }
 
-    // runtime_module->print(llvm::errs(), nullptr);
+    if (arch_ == Arch::amdgpu) {
+      module->setTargetTriple("amdgcn-amd-amdhsa");
+#ifdef TI_WITH_AMDGPU
+      llvm::legacy::FunctionPassManager function_pass_manager(module.get());
+      function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass());
+      function_pass_manager.doInitialization();
+      for (auto func = module->begin(); func != module->end(); ++func) {
+        function_pass_manager.run(*func);
+      }
+      function_pass_manager.doFinalization();
+      patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
+      patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
+#endif
+    }
   }
 
   return module;
@@ -796,6 +818,14 @@ void TaichiLLVMContext::update_runtime_jit_module(
     }
   }
 
+  if (arch_ == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
+    llvm::legacy::PassManager module_pass_manager;
+    module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
+    module_pass_manager.run(*module);
+#endif
+  }
+
   eliminate_unused_functions(module.get(), [](std::string func_name) {
     return starts_with(func_name, "runtime_") ||
            starts_with(func_name, "LLVMRuntime_");

diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_context.h"
+#endif
+
+namespace taichi {
+namespace lang {
+using namespace llvm;
+#if defined(TI_WITH_AMDGPU)
+struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
+  static char ID;
+  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
+  }
+  bool runOnFunction(llvm::Function &f) override {
+    f.addFnAttr("target-cpu",
+                "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+    f.addFnAttr("target-features", "");
+    for (auto &bb : f) {
+      std::vector<AllocaInst *> alloca_inst_vec;
+      for (Instruction &inst : bb) {
+        AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst);
+        if (!now_alloca ||
+            now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+          continue;
+        }
+        alloca_inst_vec.push_back(now_alloca);
+      }
+      for (auto &allocainst : alloca_inst_vec) {
+        auto alloca_type = allocainst->getAllocatedType();
+        IRBuilder<> builder(allocainst);
+        auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+        auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+        new_alloca->setAlignment(Align(allocainst->getAlign().value()));
+        auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
+        allocainst->replaceAllUsesWith(addrspacecast);
+        allocainst->eraseFromParent();
+      }
+    }
+    return false;
+  }
+};
+
+struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
+  static char ID;
+  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
+  }
+  bool runOnModule(llvm::Module &M) override {
+    for (auto &f : M) {
+      bool is_kernel = false;
+      const std::string func_name = f.getName().str();
+      if (starts_with(func_name, "runtime_")) {
+        f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+        // ref https://llvm.org/docs/AMDGPUUsage.html
+        // “amdgpu-flat-work-group-size”=”min,max”
+        // Specify the minimum and maximum flat work group sizes that will be
+        // specified when the kernel is dispatched. Generated by the
+        // amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied
+        // default value is 1,1024.
+        f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+        is_kernel = true;
+      }
+      if (!is_kernel && !f.isDeclaration())
+        f.setLinkage(llvm::Function::PrivateLinkage);
+    }
+    std::vector<llvm::Function *> kernel_function;
+    for (auto &f : M) {
+      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
+        kernel_function.push_back(&f);
+    }
+    for (auto &f : kernel_function) {
+      llvm::FunctionType *func_type = f->getFunctionType();
+      std::vector<llvm::Type *> new_func_params;
+      for (auto &arg : f->args()) {
+        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto new_type = llvm::PointerType::get(
+              arg.getType()->getPointerElementType(), unsigned(1));
+          new_func_params.push_back(new_type);
+        } else {
+          new_func_params.push_back(arg.getType());
+        }
+      }
+      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
+                                                   new_func_params, false);
+      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
+                                             f->getAddressSpace());
+      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+      new_func->addFnAttr(
+          "target-cpu",
+          "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+      new_func->setComdat(f->getComdat());
+      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
+      new_func->takeName(f);
+      new_func->getBasicBlockList().splice(new_func->begin(),
+                                           f->getBasicBlockList());
+      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
+                                        I2 = new_func->arg_begin();
+           I != E; ++I, ++I2) {
+        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto &front_bb = new_func->getBasicBlockList().front();
+          llvm::Instruction *addrspacecast =
+              new AddrSpaceCastInst(I2, I->getType());
+          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
+                                             addrspacecast);
+          I->replaceAllUsesWith(addrspacecast);
+          I2->takeName(&*I);
+        } else {
+          I->replaceAllUsesWith(&*I2);
+          I2->takeName(&*I);
+        }
+      }
+
+      f->eraseFromParent();
+    }
+    return false;
+  }
+};
+
+char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
+char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
+#endif
+
+}  // namespace lang
+}  // namespace taichi