From e5a32a1fc8c376b9554e178dfa82fd6a45cae748 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Mon, 31 Oct 2022 21:13:30 +0800
Subject: [PATCH 01/12] amdgpu: pass kernel pointer args in address space 1

---
 taichi/runtime/llvm/llvm_context.cpp | 65 ++++++++++++++++++++++++++++
 taichi/runtime/llvm/llvm_context.h   |  2 +
 2 files changed, 67 insertions(+)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 95b129ef2a7e8..f8e3a1c8ee4f3 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -711,6 +711,11 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func,
   }
 }
 
+void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) {
+  func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+  func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+}
+
 void TaichiLLVMContext::eliminate_unused_functions(
     llvm::Module *module,
     std::function<bool(const std::string &)> export_indicator) {
@@ -817,6 +822,66 @@ void TaichiLLVMContext::update_runtime_jit_module(
     }
   }
 
+    if (arch_ == Arch::amdgpu) {
+    for (auto &f : *module) {
+      bool is_kernel = false;
+      const std::string func_name = f.getName().str();
+      if (starts_with(func_name, "runtime_")) {
+        mark_function_as_amdgpu_kernel(&f);
+        is_kernel = true;
+      }
+      if (!is_kernel && !f.isDeclaration())
+        f.setLinkage(llvm::Function::PrivateLinkage);
+    }
+    std::vector<llvm::Function *> global_func;
+    for (auto &f : *module) {
+      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
+        global_func.push_back(&f);
+    }
+    for (auto &f : global_func) {
+      llvm::FunctionType *func_type = f->getFunctionType();
+      std::vector<llvm::Type*> new_func_params;
+      for (auto &arg : f->args()) {
+        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1));
+          new_func_params.push_back(new_type);
+        }
+        else {
+          new_func_params.push_back(arg.getType());
+        }
+      } 
+      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false);
+      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace());
+      //NF->copyAttributesFrom(f);
+      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+      new_func->setComdat(f->getComdat());
+      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
+      new_func->takeName(f);
+      new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList());
+      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
+                                  I2 = new_func->arg_begin(); I != E; ++I, ++I2) {
+        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto &front_bb = new_func->getBasicBlockList().front();
+          llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType());
+          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast);
+          I->replaceAllUsesWith(addrspacecast);
+          I2->takeName(&*I);
+        } 
+        else {
+          I->replaceAllUsesWith(&*I2);
+          I2->takeName(&*I);
+        }
+      }
+
+      SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+      f->getAllMetadata(MDs);
+      for (auto [KindID, Node] : MDs)
+        new_func->addMetadata(KindID, *Node);
+      f->eraseFromParent();
+    }
+  }
+
   eliminate_unused_functions(module.get(), [](std::string func_name) {
     return starts_with(func_name, "runtime_") ||
            starts_with(func_name, "LLVMRuntime_");
diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h
index ddea66efd763d..58ca21f68c0d6 100644
--- a/taichi/runtime/llvm/llvm_context.h
+++ b/taichi/runtime/llvm/llvm_context.h
@@ -125,6 +125,8 @@ class TaichiLLVMContext {
 
   void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0);
 
+  void mark_function_as_amdgpu_kernel(llvm::Function *func);
+
   void fetch_this_thread_struct_module();
   llvm::Module *get_this_thread_runtime_module();
   llvm::Function *get_runtime_function(const std::string &name);

From 8e91537d4de00056a46aeebf6723189300bd2cef Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Mon, 31 Oct 2022 21:22:45 +0800
Subject: [PATCH 02/12] add AMDGPU LLVM context initialization API

---
 taichi/runtime/llvm/llvm_runtime_executor.cpp   | 17 +++++++++++++++++
 taichi/runtime/llvm/llvm_runtime_executor.h     |  2 ++
 .../runtime/program_impls/llvm/llvm_program.h   |  4 ++++
 3 files changed, 23 insertions(+)

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 02a7148084b3b..6d087696ef71b 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -112,6 +112,15 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
   }
 #endif
 
+#if define(TI_WITH_AMDGPU)
+  if (config.arch == Arch::amdgpu) {
+    AMDGPUContext::get_instance().set_debug(config.debug);
+    device_ = std::make_shared<amdgpu::AMDGPUDevice>();
+
+    this->maybe_initialize_amdgpu_llvm_context();
+  }
+#endif
+
 #ifdef TI_WITH_DX12
   if (config.arch == Arch::dx12) {
     // FIXME: add dx12 device.
@@ -149,6 +158,14 @@ void LlvmRuntimeExecutor::maybe_initialize_cuda_llvm_context() {
   }
 }
 
+void LlvmRuntimeExecutor::maybe_initialize_amdgpu_llvm_context() {
+  if (config_->arch == Arch::amdgpu && llvm_context_device_ == nullptr) {
+    llvm_context_device_ =
+        std::make_unique<TaichiLLVMContext>(config_, Arch::amdgpu);
+    llvm_context_device_->init_runtime_jit_module();
+  }
+}
+
 void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
                                                   uint64 *result_buffer) {
   auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.h b/taichi/runtime/llvm/llvm_runtime_executor.h
index 85cc3def851e0..f9f5a6e6cc7cb 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.h
+++ b/taichi/runtime/llvm/llvm_runtime_executor.h
@@ -129,6 +129,8 @@ class LlvmRuntimeExecutor {
    */
   void maybe_initialize_cuda_llvm_context();
 
+  void maybe_initialize_amdgpu_llvm_context();
+
   void finalize();
 
   uint64 fetch_result_uint64(int i, uint64 *result_buffer);
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index 5e16dc3ea57f0..a8a71b06c7853 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -159,6 +159,10 @@ class LlvmProgramImpl : public ProgramImpl {
     runtime_exec_->maybe_initialize_cuda_llvm_context();
   }
 
+  void maybe_initialize_amdgpu_llvm_context() {
+    runtime_exec_->maybe_initialize_amdgpu_llvm_context();
+  }
+
   uint64 fetch_result_uint64(int i, uint64 *result_buffer) override {
     return runtime_exec_->fetch_result_uint64(i, result_buffer);
   }

From 3546946e2b4e81c626c91fcb2ef4b97c4b9dc3c0 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Mon, 31 Oct 2022 21:36:57 +0800
Subject: [PATCH 03/12] amdgpu: set triple and move allocas to address space 5

---
 taichi/runtime/llvm/llvm_context.cpp | 83 +++++++++++++++++++---------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index f8e3a1c8ee4f3..b6057e96fe343 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -15,6 +15,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsNVPTX.h"
+#ifdef TI_WITH_AMDGPU
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#endif //TI_WITH_AMDGPU
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -334,23 +337,8 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   auto ctx = get_this_thread_context();
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
-  if (arch_ == Arch::cuda) {
-    module->setTargetTriple("nvptx64-nvidia-cuda");
-
-#if defined(TI_WITH_CUDA)
-    auto func = module->getFunction("cuda_compute_capability");
-    if (func) {
-      func->deleteBody();
-      auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
-      IRBuilder<> builder(*ctx);
-      builder.SetInsertPoint(bb);
-      builder.CreateRet(
-          get_constant(CUDAContext::get_instance().get_compute_capability()));
-      TaichiLLVMContext::mark_inline(func);
-    }
-#endif
-
-    auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
+  if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
+      auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
                                std::vector<llvm::Value *> extra_args = {}) {
@@ -399,6 +387,28 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
       TaichiLLVMContext::mark_inline(func);
     };
 
+    patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);
+    patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
+    patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
+    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
+  }
+
+  if (arch_ == Arch::cuda) {
+    module->setTargetTriple("nvptx64-nvidia-cuda");
+
+#if defined(TI_WITH_CUDA)
+    auto func = module->getFunction("cuda_compute_capability");
+    if (func) {
+      func->deleteBody();
+      auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
+      IRBuilder<> builder(*ctx);
+      builder.SetInsertPoint(bb);
+      builder.CreateRet(
+          get_constant(CUDAContext::get_instance().get_compute_capability()));
+      TaichiLLVMContext::mark_inline(func);
+    }
+#endif
+
     patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
     patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
     patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
@@ -464,14 +474,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
                     {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
 
-    patch_atomic_add("atomic_add_i32", llvm::AtomicRMWInst::Add);
-
-    patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
-
-    patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-
-    patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
-
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
     link_module_with_cuda_libdevice(module);
@@ -488,6 +490,37 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     // runtime_module->print(llvm::errs(), nullptr);
   }
 
+  if (arch_ == Arch::amdgpu) {
+    module->setTargetTriple("amdgcn-amd-amdhsa");
+    for (auto &f : *module) {
+       f.addFnAttr("target-cpu","");
+       f.addFnAttr("target-features","");
+      for (auto &bb: f) {
+        std::vector<llvm::AllocaInst*> alloca_inst_vec;
+        for (llvm::Instruction &inst : bb) {
+            llvm::AllocaInst* now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
+            if (!now_alloca || 
+                now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+              continue;
+            }
+            alloca_inst_vec.push_back(now_alloca);
+        }
+        for (auto &allocainst : alloca_inst_vec) {
+            auto alloca_type = allocainst->getAllocatedType();
+            llvm::IRBuilder<> builder(allocainst);
+            auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+            auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+            new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
+            auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
+            allocainst->replaceAllUsesWith(addrspacecast);
+            allocainst->eraseFromParent();
+        }
+      }
+    }
+    patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
+    patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
+  }
+
   return module;
 }
 

From a0ec473d6cd5ade8b1a98ddeac8a04a5afbc4953 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 1 Nov 2022 00:45:36 +0800
Subject: [PATCH 04/12] fix scope of cuda/amdgpu branches using patch_intrinsic

---
 taichi/runtime/llvm/llvm_context.cpp | 245 ++++++++++++++-------------
 1 file changed, 123 insertions(+), 122 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index b6057e96fe343..87f25433b15f4 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -391,134 +391,135 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
     patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
     patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-  }
-
-  if (arch_ == Arch::cuda) {
-    module->setTargetTriple("nvptx64-nvidia-cuda");
-
-#if defined(TI_WITH_CUDA)
-    auto func = module->getFunction("cuda_compute_capability");
-    if (func) {
-      func->deleteBody();
-      auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
-      IRBuilder<> builder(*ctx);
-      builder.SetInsertPoint(bb);
-      builder.CreateRet(
-          get_constant(CUDAContext::get_instance().get_compute_capability()));
-      TaichiLLVMContext::mark_inline(func);
-    }
-#endif
-
-    patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
-    patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
-    patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
-    patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
-    patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
-    patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
-    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
-    patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
-    patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
-
-    patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
-    patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
-
-    patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
-    patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
-
-    patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
-    patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
-
-    patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
-    patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
-
-    patch_intrinsic("cuda_shfl_down_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_down_i32);
-    patch_intrinsic("cuda_shfl_down_sync_f32",
-                    Intrinsic::nvvm_shfl_sync_down_f32);
-
-    patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
-    patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);
-
-    patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
-
-    patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);
-
-    patch_intrinsic("cuda_shfl_xor_sync_i32",
-                    Intrinsic::nvvm_shfl_sync_bfly_i32);
-
-    patch_intrinsic("cuda_match_any_sync_i32",
-                    Intrinsic::nvvm_match_any_sync_i32);
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    nvvm_match_all_sync_i32
-    Args:
-        1. u32 mask
-        2. i32 value
-        3. i32 *pred
-    */
-    /*
-    patch_intrinsic("cuda_match_all_sync_i32p",
-                    Intrinsic::nvvm_math_all_sync_i32);
-    */
-
-    // LLVM 10.0.0 seems to have a bug on this intrinsic function
-    /*
-    patch_intrinsic("cuda_match_any_sync_i64",
-                    Intrinsic::nvvm_match_any_sync_i64);
-                    */
-
-    patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-    patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
-                    {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
-
-    patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
-
-    link_module_with_cuda_libdevice(module);
-
-    // To prevent potential symbol name conflicts, we use "cuda_vprintf"
-    // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
-    // linking
-    for (auto &f : *module) {
-      if (f.getName() == "cuda_vprintf") {
-        f.setName("vprintf");
+  
+
+    if (arch_ == Arch::cuda) {
+      module->setTargetTriple("nvptx64-nvidia-cuda");
+
+  #if defined(TI_WITH_CUDA)
+      auto func = module->getFunction("cuda_compute_capability");
+      if (func) {
+        func->deleteBody();
+        auto bb = llvm::BasicBlock::Create(*ctx, "entry", func);
+        IRBuilder<> builder(*ctx);
+        builder.SetInsertPoint(bb);
+        builder.CreateRet(
+            get_constant(CUDAContext::get_instance().get_compute_capability()));
+        TaichiLLVMContext::mark_inline(func);
+      }
+  #endif
+
+      patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
+      patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
+      patch_intrinsic("block_idx", Intrinsic::nvvm_read_ptx_sreg_ctaid_x);
+      patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
+      patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
+      patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+      patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+      patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
+      patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
+
+      patch_intrinsic("cuda_all", Intrinsic::nvvm_vote_all);
+      patch_intrinsic("cuda_all_sync", Intrinsic::nvvm_vote_all_sync);
+
+      patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any);
+      patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync);
+
+      patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni);
+      patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync);
+
+      patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot);
+      patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync);
+
+      patch_intrinsic("cuda_shfl_down_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_down_i32);
+      patch_intrinsic("cuda_shfl_down_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_down_f32);
+
+      patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
+      patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);
+
+      patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
+
+      patch_intrinsic("cuda_shfl_sync_f32", Intrinsic::nvvm_shfl_sync_idx_f32);
+
+      patch_intrinsic("cuda_shfl_xor_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_bfly_i32);
+
+      patch_intrinsic("cuda_match_any_sync_i32",
+                      Intrinsic::nvvm_match_any_sync_i32);
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      nvvm_match_all_sync_i32
+      Args:
+          1. u32 mask
+          2. i32 value
+          3. i32 *pred
+      */
+      /*
+      patch_intrinsic("cuda_match_all_sync_i32p",
+                      Intrinsic::nvvm_math_all_sync_i32);
+      */
+
+      // LLVM 10.0.0 seems to have a bug on this intrinsic function
+      /*
+      patch_intrinsic("cuda_match_any_sync_i64",
+                      Intrinsic::nvvm_match_any_sync_i64);
+                      */
+
+      patch_intrinsic("ctlz_i32", Intrinsic::ctlz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+      patch_intrinsic("cttz_i32", Intrinsic::cttz, true,
+                      {llvm::Type::getInt32Ty(*ctx)}, {get_constant(false)});
+
+      patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
+
+      link_module_with_cuda_libdevice(module);
+
+      // To prevent potential symbol name conflicts, we use "cuda_vprintf"
+      // instead of "vprintf" in llvm/runtime.cpp. Now we change it back for
+      // linking
+      for (auto &f : *module) {
+        if (f.getName() == "cuda_vprintf") {
+          f.setName("vprintf");
+        }
       }
-    }
 
-    // runtime_module->print(llvm::errs(), nullptr);
-  }
+      // runtime_module->print(llvm::errs(), nullptr);
+    }
 
-  if (arch_ == Arch::amdgpu) {
-    module->setTargetTriple("amdgcn-amd-amdhsa");
-    for (auto &f : *module) {
-       f.addFnAttr("target-cpu","");
-       f.addFnAttr("target-features","");
-      for (auto &bb: f) {
-        std::vector<llvm::AllocaInst*> alloca_inst_vec;
-        for (llvm::Instruction &inst : bb) {
-            llvm::AllocaInst* now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
-            if (!now_alloca || 
-                now_alloca->getType()->getAddressSpace() != (unsigned)0) {
-              continue;
-            }
-            alloca_inst_vec.push_back(now_alloca);
-        }
-        for (auto &allocainst : alloca_inst_vec) {
-            auto alloca_type = allocainst->getAllocatedType();
-            llvm::IRBuilder<> builder(allocainst);
-            auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
-            auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
-            new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
-            auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
-            allocainst->replaceAllUsesWith(addrspacecast);
-            allocainst->eraseFromParent();
+    if (arch_ == Arch::amdgpu) {
+      module->setTargetTriple("amdgcn-amd-amdhsa");
+      for (auto &f : *module) {
+        f.addFnAttr("target-cpu","");
+        f.addFnAttr("target-features","");
+        for (auto &bb: f) {
+          std::vector<llvm::AllocaInst*> alloca_inst_vec;
+          for (llvm::Instruction &inst : bb) {
+              llvm::AllocaInst* now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
+              if (!now_alloca || 
+                  now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+                continue;
+              }
+              alloca_inst_vec.push_back(now_alloca);
+          }
+          for (auto &allocainst : alloca_inst_vec) {
+              auto alloca_type = allocainst->getAllocatedType();
+              llvm::IRBuilder<> builder(allocainst);
+              auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+              auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+              new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
+              auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
+              allocainst->replaceAllUsesWith(addrspacecast);
+              allocainst->eraseFromParent();
+          }
         }
       }
+      patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
+      patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
     }
-    patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
-    patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
   }
 
   return module;

From 7fd4225003e58cba22f657466c5806c46fcc32cc Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 1 Nov 2022 01:40:06 +0800
Subject: [PATCH 05/12] add TI_WITH_CUDA/TI_WITH_AMDGPU macro guards

---
 taichi/runtime/llvm/llvm_context.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 87f25433b15f4..6e87b8d7f16fa 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -338,6 +338,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
   if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
+#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
       auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
@@ -391,8 +392,8 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
     patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
     patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
+#endif
   
-
     if (arch_ == Arch::cuda) {
       module->setTargetTriple("nvptx64-nvidia-cuda");
 
@@ -492,6 +493,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
 
     if (arch_ == Arch::amdgpu) {
       module->setTargetTriple("amdgcn-amd-amdhsa");
+#ifdef TI_WITH_AMDGPU
       for (auto &f : *module) {
         f.addFnAttr("target-cpu","");
         f.addFnAttr("target-features","");
@@ -519,6 +521,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
       }
       patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
       patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
+#endif
     }
   }
 

From 82b4ccb3f14e952864029d848d31f2c571af8566 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 1 Nov 2022 01:52:05 +0800
Subject: [PATCH 06/12] remove extra TI_WITH_CUDA/TI_WITH_AMDGPU guard

---
 taichi/runtime/llvm/llvm_context.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 6e87b8d7f16fa..94cb364339038 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -338,7 +338,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
   if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
-#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
       auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
@@ -392,7 +391,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
     patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
     patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-#endif
   
     if (arch_ == Arch::cuda) {
       module->setTargetTriple("nvptx64-nvidia-cuda");

From d30e24b96a243d8fa76a7b582c4c082aa7ac8cdf Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Tue, 1 Nov 2022 01:56:40 +0800
Subject: [PATCH 07/12] fix typo: "#if define" -> "#if defined"

---
 taichi/runtime/llvm/llvm_runtime_executor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 6d087696ef71b..1fbddac1b3a05 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -112,7 +112,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
   }
 #endif
 
-#if define(TI_WITH_AMDGPU)
+#if defined(TI_WITH_AMDGPU)
   if (config.arch == Arch::amdgpu) {
     AMDGPUContext::get_instance().set_debug(config.debug);
     device_ = std::make_shared<amdgpu::AMDGPUDevice>();

From 660b98de492de189a39a5b4703e9ca775648e373 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 31 Oct 2022 17:57:53 +0000
Subject: [PATCH 08/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 taichi/runtime/llvm/llvm_context.cpp | 88 +++++++++++++++-------------
 1 file changed, 48 insertions(+), 40 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 94cb364339038..4c1af07203e5a 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -17,7 +17,7 @@
 #include "llvm/IR/IntrinsicsNVPTX.h"
 #ifdef TI_WITH_AMDGPU
 #include "llvm/IR/IntrinsicsAMDGPU.h"
-#endif //TI_WITH_AMDGPU
+#endif  // TI_WITH_AMDGPU
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
@@ -338,7 +338,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
   std::unique_ptr<llvm::Module> module = module_from_bitcode_file(
       fmt::format("{}/{}", runtime_lib_dir(), file), ctx);
   if (arch_ == Arch::cuda || arch_ == Arch::amdgpu) {
-      auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
+    auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin,
                                bool ret = true,
                                std::vector<llvm::Type *> types = {},
                                std::vector<llvm::Value *> extra_args = {}) {
@@ -391,11 +391,11 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     patch_atomic_add("atomic_add_i64", llvm::AtomicRMWInst::Add);
     patch_atomic_add("atomic_add_f64", llvm::AtomicRMWInst::FAdd);
     patch_atomic_add("atomic_add_f32", llvm::AtomicRMWInst::FAdd);
-  
+
     if (arch_ == Arch::cuda) {
       module->setTargetTriple("nvptx64-nvidia-cuda");
 
-  #if defined(TI_WITH_CUDA)
+#if defined(TI_WITH_CUDA)
       auto func = module->getFunction("cuda_compute_capability");
       if (func) {
         func->deleteBody();
@@ -406,7 +406,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
             get_constant(CUDAContext::get_instance().get_compute_capability()));
         TaichiLLVMContext::mark_inline(func);
       }
-  #endif
+#endif
 
       patch_intrinsic("thread_idx", Intrinsic::nvvm_read_ptx_sreg_tid_x);
       patch_intrinsic("cuda_clock_i64", Intrinsic::nvvm_read_ptx_sreg_clock64);
@@ -436,8 +436,10 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
       patch_intrinsic("cuda_shfl_down_sync_f32",
                       Intrinsic::nvvm_shfl_sync_down_f32);
 
-      patch_intrinsic("cuda_shfl_up_sync_i32", Intrinsic::nvvm_shfl_sync_up_i32);
-      patch_intrinsic("cuda_shfl_up_sync_f32", Intrinsic::nvvm_shfl_sync_up_f32);
+      patch_intrinsic("cuda_shfl_up_sync_i32",
+                      Intrinsic::nvvm_shfl_sync_up_i32);
+      patch_intrinsic("cuda_shfl_up_sync_f32",
+                      Intrinsic::nvvm_shfl_sync_up_f32);
 
       patch_intrinsic("cuda_shfl_sync_i32", Intrinsic::nvvm_shfl_sync_idx_i32);
 
@@ -493,27 +495,28 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
       module->setTargetTriple("amdgcn-amd-amdhsa");
 #ifdef TI_WITH_AMDGPU
       for (auto &f : *module) {
-        f.addFnAttr("target-cpu","");
-        f.addFnAttr("target-features","");
-        for (auto &bb: f) {
-          std::vector<llvm::AllocaInst*> alloca_inst_vec;
+        f.addFnAttr("target-cpu", "");
+        f.addFnAttr("target-features", "");
+        for (auto &bb : f) {
+          std::vector<llvm::AllocaInst *> alloca_inst_vec;
           for (llvm::Instruction &inst : bb) {
-              llvm::AllocaInst* now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
-              if (!now_alloca || 
-                  now_alloca->getType()->getAddressSpace() != (unsigned)0) {
-                continue;
-              }
-              alloca_inst_vec.push_back(now_alloca);
+            llvm::AllocaInst *now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
+            if (!now_alloca ||
+                now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+              continue;
+            }
+            alloca_inst_vec.push_back(now_alloca);
           }
           for (auto &allocainst : alloca_inst_vec) {
-              auto alloca_type = allocainst->getAllocatedType();
-              llvm::IRBuilder<> builder(allocainst);
-              auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
-              auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
-              new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
-              auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
-              allocainst->replaceAllUsesWith(addrspacecast);
-              allocainst->eraseFromParent();
+            auto alloca_type = allocainst->getAllocatedType();
+            llvm::IRBuilder<> builder(allocainst);
+            auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+            auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+            new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
+            auto *addrspacecast =
+                builder.CreateAddrSpaceCast(new_alloca, new_type);
+            allocainst->replaceAllUsesWith(addrspacecast);
+            allocainst->eraseFromParent();
           }
         }
       }
@@ -857,7 +860,7 @@ void TaichiLLVMContext::update_runtime_jit_module(
     }
   }
 
-    if (arch_ == Arch::amdgpu) {
+  if (arch_ == Arch::amdgpu) {
     for (auto &f : *module) {
       bool is_kernel = false;
       const std::string func_name = f.getName().str();
@@ -875,35 +878,40 @@ void TaichiLLVMContext::update_runtime_jit_module(
     }
     for (auto &f : global_func) {
       llvm::FunctionType *func_type = f->getFunctionType();
-      std::vector<llvm::Type*> new_func_params;
+      std::vector<llvm::Type *> new_func_params;
       for (auto &arg : f->args()) {
         if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-          auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1));
+          auto new_type = llvm::PointerType::get(
+              arg.getType()->getPointerElementType(), unsigned(1));
           new_func_params.push_back(new_type);
-        }
-        else {
+        } else {
           new_func_params.push_back(arg.getType());
         }
-      } 
-      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false);
-      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace());
-      //NF->copyAttributesFrom(f);
+      }
+      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
+                                                   new_func_params, false);
+      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
+                                             f->getAddressSpace());
+      // NF->copyAttributesFrom(f);
       new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
       new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
       new_func->setComdat(f->getComdat());
       f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
       new_func->takeName(f);
-      new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList());
+      new_func->getBasicBlockList().splice(new_func->begin(),
+                                           f->getBasicBlockList());
       for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
-                                  I2 = new_func->arg_begin(); I != E; ++I, ++I2) {
+                                        I2 = new_func->arg_begin();
+           I != E; ++I, ++I2) {
         if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
           auto &front_bb = new_func->getBasicBlockList().front();
-          llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType());
-          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast);
+          llvm::Instruction *addrspacecast =
+              new AddrSpaceCastInst(I2, I->getType());
+          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
+                                             addrspacecast);
           I->replaceAllUsesWith(addrspacecast);
           I2->takeName(&*I);
-        } 
-        else {
+        } else {
           I->replaceAllUsesWith(&*I2);
           I2->takeName(&*I);
         }

From 1e8c1f465309e927c911ec4ddb8d2a2fc9e7cd7c Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 22 Dec 2022 21:20:07 +0800
Subject: [PATCH 09/12] Move AMDGPU IR rewrites into LLVM passes

---
 taichi/runtime/llvm/llvm_context.cpp    | 105 +++-----------------
 taichi/runtime/llvm/llvm_context.h      |   2 -
 taichi/runtime/llvm/llvm_context_pass.h | 126 ++++++++++++++++++++++++
 3 files changed, 139 insertions(+), 94 deletions(-)
 create mode 100644 taichi/runtime/llvm/llvm_context_pass.h

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 4c1af07203e5a..beb0e6309feaa 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -50,6 +50,10 @@
 #include "llvm_context.h"
 #include "taichi/runtime/program_impls/llvm/llvm_program.h"
 #include "taichi/codegen/codegen_utils.h"
+#ifdef TI_WITH_AMDGPU
+#include "taichi/runtime/llvm/llvm_context_pass.h"
+#endif
+
 
 #ifdef _WIN32
 // Travis CI seems doesn't support <filesystem>...
@@ -494,32 +498,13 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::module_from_file(
     if (arch_ == Arch::amdgpu) {
       module->setTargetTriple("amdgcn-amd-amdhsa");
 #ifdef TI_WITH_AMDGPU
-      for (auto &f : *module) {
-        f.addFnAttr("target-cpu", "");
-        f.addFnAttr("target-features", "");
-        for (auto &bb : f) {
-          std::vector<llvm::AllocaInst *> alloca_inst_vec;
-          for (llvm::Instruction &inst : bb) {
-            llvm::AllocaInst *now_alloca = llvm::dyn_cast<AllocaInst>(&inst);
-            if (!now_alloca ||
-                now_alloca->getType()->getAddressSpace() != (unsigned)0) {
-              continue;
-            }
-            alloca_inst_vec.push_back(now_alloca);
-          }
-          for (auto &allocainst : alloca_inst_vec) {
-            auto alloca_type = allocainst->getAllocatedType();
-            llvm::IRBuilder<> builder(allocainst);
-            auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
-            auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
-            new_alloca->setAlignment(llvm::Align(allocainst->getAlignment()));
-            auto *addrspacecast =
-                builder.CreateAddrSpaceCast(new_alloca, new_type);
-            allocainst->replaceAllUsesWith(addrspacecast);
-            allocainst->eraseFromParent();
-          }
-        }
+      llvm::legacy::FunctionPassManager function_pass_manager(module.get());
+      function_pass_manager.add(new AMDGPUConvertAllocaInstAddressSpacePass());
+      function_pass_manager.doInitialization();
+      for (auto func = module->begin(); func != module->end(); ++func) {
+        function_pass_manager.run(*func);
       }
+      function_pass_manager.doFinalization();
       patch_intrinsic("thread_idx", llvm::Intrinsic::amdgcn_workitem_id_x);
       patch_intrinsic("block_idx", llvm::Intrinsic::amdgcn_workgroup_id_x);
 #endif
@@ -749,11 +734,6 @@ void TaichiLLVMContext::mark_function_as_cuda_kernel(llvm::Function *func,
   }
 }
 
-void TaichiLLVMContext::mark_function_as_amdgpu_kernel(llvm::Function *func) {
-  func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-  func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
-}
-
 void TaichiLLVMContext::eliminate_unused_functions(
     llvm::Module *module,
     std::function<bool(const std::string &)> export_indicator) {
@@ -861,68 +841,9 @@ void TaichiLLVMContext::update_runtime_jit_module(
   }
 
   if (arch_ == Arch::amdgpu) {
-    for (auto &f : *module) {
-      bool is_kernel = false;
-      const std::string func_name = f.getName().str();
-      if (starts_with(func_name, "runtime_")) {
-        mark_function_as_amdgpu_kernel(&f);
-        is_kernel = true;
-      }
-      if (!is_kernel && !f.isDeclaration())
-        f.setLinkage(llvm::Function::PrivateLinkage);
-    }
-    std::vector<llvm::Function *> global_func;
-    for (auto &f : *module) {
-      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
-        global_func.push_back(&f);
-    }
-    for (auto &f : global_func) {
-      llvm::FunctionType *func_type = f->getFunctionType();
-      std::vector<llvm::Type *> new_func_params;
-      for (auto &arg : f->args()) {
-        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-          auto new_type = llvm::PointerType::get(
-              arg.getType()->getPointerElementType(), unsigned(1));
-          new_func_params.push_back(new_type);
-        } else {
-          new_func_params.push_back(arg.getType());
-        }
-      }
-      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
-                                                   new_func_params, false);
-      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
-                                             f->getAddressSpace());
-      // NF->copyAttributesFrom(f);
-      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
-      new_func->setComdat(f->getComdat());
-      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
-      new_func->takeName(f);
-      new_func->getBasicBlockList().splice(new_func->begin(),
-                                           f->getBasicBlockList());
-      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
-                                        I2 = new_func->arg_begin();
-           I != E; ++I, ++I2) {
-        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
-          auto &front_bb = new_func->getBasicBlockList().front();
-          llvm::Instruction *addrspacecast =
-              new AddrSpaceCastInst(I2, I->getType());
-          front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(),
-                                             addrspacecast);
-          I->replaceAllUsesWith(addrspacecast);
-          I2->takeName(&*I);
-        } else {
-          I->replaceAllUsesWith(&*I2);
-          I2->takeName(&*I);
-        }
-      }
-
-      SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
-      f->getAllMetadata(MDs);
-      for (auto [KindID, Node] : MDs)
-        new_func->addMetadata(KindID, *Node);
-      f->eraseFromParent();
-    }
+    llvm::legacy::PassManager module_pass_manager;
+    module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
+    module_pass_manager.run(*module);
   }
 
   eliminate_unused_functions(module.get(), [](std::string func_name) {
diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h
index 58ca21f68c0d6..ddea66efd763d 100644
--- a/taichi/runtime/llvm/llvm_context.h
+++ b/taichi/runtime/llvm/llvm_context.h
@@ -125,8 +125,6 @@ class TaichiLLVMContext {
 
   void mark_function_as_cuda_kernel(llvm::Function *func, int block_dim = 0);
 
-  void mark_function_as_amdgpu_kernel(llvm::Function *func);
-
   void fetch_this_thread_struct_module();
   llvm::Module *get_this_thread_runtime_module();
   llvm::Function *get_runtime_function(const std::string &name);
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
new file mode 100644
index 0000000000000..c6d2cd7a64ada
--- /dev/null
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -0,0 +1,126 @@
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/Host.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IRBuilder.h"
+
+#if defined(TI_WITH_AMDGPU)
+#include "taichi/rhi/amdgpu/amdgpu_context.h"
+#endif
+
+namespace taichi {
+namespace lang {
+using namespace llvm;
+struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
+    static char ID;
+    AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {}
+    bool runOnFunction(llvm::Function &f) override {
+        f.addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4));
+        f.addFnAttr("target-features","");
+        for (auto &bb: f) {
+            std::vector<AllocaInst*> alloca_inst_vec;
+            for (Instruction &inst : bb) {
+                AllocaInst* now_alloca = dyn_cast<AllocaInst>(&inst);
+                if (!now_alloca || 
+                    now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+                continue;
+                }
+                alloca_inst_vec.push_back(now_alloca);
+            }
+            for (auto &allocainst : alloca_inst_vec) {
+                auto alloca_type = allocainst->getAllocatedType();
+                IRBuilder<> builder(allocainst);
+                auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
+                auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
+                new_alloca->setAlignment(Align(allocainst->getAlign().value()));
+                auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
+                allocainst->replaceAllUsesWith(addrspacecast);
+                allocainst->eraseFromParent();
+            }
+        }
+        return false;
+    }
+};
+
+struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
+    static char ID;
+    AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {}
+    bool runOnModule(llvm::Module &M) override {
+        for (auto &f : M) {
+        bool is_kernel = false;
+        const std::string func_name = f.getName().str();
+        if (starts_with(func_name, "runtime_")) {
+          f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+          f.addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+          is_kernel = true;
+        }
+        if (!is_kernel && !f.isDeclaration())
+          f.setLinkage(llvm::Function::PrivateLinkage);
+      }
+      std::vector<llvm::Function *> global_func;
+      for (auto &f : M) {
+        if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
+          global_func.push_back(&f);
+      }
+      for (auto &f : global_func) {
+        llvm::FunctionType *func_type = f->getFunctionType();
+        std::vector<llvm::Type*> new_func_params;
+        for (auto &arg : f->args()) {
+          if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+            auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1));
+            new_func_params.push_back(new_type);
+          }
+          else {
+            new_func_params.push_back(arg.getType());
+          }
+        }
+        auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false);
+        auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace());
+        new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+        new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+        new_func->addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4));
+        new_func->setComdat(f->getComdat());
+        f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
+        new_func->takeName(f);
+        new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList());
+        for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
+                                    I2 = new_func->arg_begin(); I != E; ++I, ++I2) {
+          if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+            auto &front_bb = new_func->getBasicBlockList().front();
+            llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType());
+            front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast);
+            I->replaceAllUsesWith(addrspacecast);
+            I2->takeName(&*I);
+          }
+          else {
+            I->replaceAllUsesWith(&*I2);
+            I2->takeName(&*I);
+          }
+        }
+
+        f->eraseFromParent();
+      }
+      return false;
+    }
+};
+
+char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
+char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
+
+} // namespace lang
+} // namespace taichi
\ No newline at end of file

From 325732a1d1307e29337396881c0629d21898c4e6 Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Thu, 22 Dec 2022 21:28:04 +0800
Subject: [PATCH 10/12] Fix bug and resolve review conversations

---
 taichi/runtime/llvm/llvm_context.cpp    |  2 ++
 taichi/runtime/llvm/llvm_context_pass.h | 13 +++++++++----
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index beb0e6309feaa..8cd5cf4e785dd 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -841,9 +841,11 @@ void TaichiLLVMContext::update_runtime_jit_module(
   }
 
   if (arch_ == Arch::amdgpu) {
+#ifdef TI_WITH_AMDGPU
     llvm::legacy::PassManager module_pass_manager;
     module_pass_manager.add(new AMDGPUConvertFuncParamAddressSpacePass());
     module_pass_manager.run(*module);
+#endif
   }
 
   eliminate_unused_functions(module.get(), [](std::string func_name) {
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index c6d2cd7a64ada..b1a09325c4ea5 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -66,18 +66,23 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
         const std::string func_name = f.getName().str();
         if (starts_with(func_name, "runtime_")) {
           f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-          f.addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+          // ref https://llvm.org/docs/AMDGPUUsage.html
+          // “amdgpu-flat-work-group-size”=”min,max”
+          // Specify the minimum and maximum flat work group sizes that will be specified when the kernel is dispatched. 
+          // Generated by the amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. 
+          // The implied default value is 1,1024.
+          f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
           is_kernel = true;
         }
         if (!is_kernel && !f.isDeclaration())
           f.setLinkage(llvm::Function::PrivateLinkage);
       }
-      std::vector<llvm::Function *> global_func;
+      std::vector<llvm::Function *> kernel_function;
       for (auto &f : M) {
         if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
-          global_func.push_back(&f);
+          kernel_function.push_back(&f);
       }
-      for (auto &f : global_func) {
+      for (auto &f : kernel_function) {
         llvm::FunctionType *func_type = f->getFunctionType();
         std::vector<llvm::Type*> new_func_params;
         for (auto &arg : f->args()) {

From bd6b58d54232bcacdf98eab444c2c362e30f852a Mon Sep 17 00:00:00 2001
From: zeyuli <li_zeyu@pku.edu.cn>
Date: Fri, 23 Dec 2022 23:21:14 +0800
Subject: [PATCH 11/12] Remove extra header includes from llvm_context_pass.h

---
 taichi/runtime/llvm/llvm_context_pass.h | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index b1a09325c4ea5..33006812ebeb6 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -1,19 +1,10 @@
-#include "llvm/Analysis/TargetTransformInfo.h"
+#pragma once
+  
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Pass.h"
 #include "llvm/IR/Module.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/Host.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -26,6 +17,7 @@
 namespace taichi {
 namespace lang {
 using namespace llvm;
+#if defined(TI_WITH_AMDGPU)
 struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
     static char ID;
     AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {}
@@ -126,6 +118,7 @@ struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
 
 char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
 char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
+#endif
 
 } // namespace lang
-} // namespace taichi
\ No newline at end of file
+} // namespace taichi

From c015143617602a4ce2a42e17e685640fa4354a65 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Dec 2022 15:22:38 +0000
Subject: [PATCH 12/12] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 taichi/runtime/llvm/llvm_context.cpp    |   1 -
 taichi/runtime/llvm/llvm_context_pass.h | 187 +++++++++++++-----------
 2 files changed, 99 insertions(+), 89 deletions(-)

diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index 8cd5cf4e785dd..31ee42da2791f 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -54,7 +54,6 @@
 #include "taichi/runtime/llvm/llvm_context_pass.h"
 #endif
 
-
 #ifdef _WIN32
 // Travis CI seems doesn't support <filesystem>...
 #include <filesystem>
diff --git a/taichi/runtime/llvm/llvm_context_pass.h b/taichi/runtime/llvm/llvm_context_pass.h
index 33006812ebeb6..d48303dbc6670 100644
--- a/taichi/runtime/llvm/llvm_context_pass.h
+++ b/taichi/runtime/llvm/llvm_context_pass.h
@@ -1,5 +1,5 @@
 #pragma once
-  
+
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Function.h"
@@ -19,106 +19,144 @@ namespace lang {
 using namespace llvm;
 #if defined(TI_WITH_AMDGPU)
+// Function pass that moves address-space-0 allocas into address space 5 and
+// hands every former user an addrspacecast back to the original pointer type.
+// It also stamps "target-cpu"/"target-features" attributes on each function.
 struct AMDGPUConvertAllocaInstAddressSpacePass : public FunctionPass {
-    static char ID;
-    AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {}
-    bool runOnFunction(llvm::Function &f) override {
-        f.addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4));
-        f.addFnAttr("target-features","");
-        for (auto &bb: f) {
-            std::vector<AllocaInst*> alloca_inst_vec;
-            for (Instruction &inst : bb) {
-                AllocaInst* now_alloca = dyn_cast<AllocaInst>(&inst);
-                if (!now_alloca || 
-                    now_alloca->getType()->getAddressSpace() != (unsigned)0) {
-                continue;
-                }
-                alloca_inst_vec.push_back(now_alloca);
-            }
-            for (auto &allocainst : alloca_inst_vec) {
-                auto alloca_type = allocainst->getAllocatedType();
-                IRBuilder<> builder(allocainst);
-                auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5);
-                auto new_type = llvm::PointerType::get(alloca_type, (unsigned)0);
-                new_alloca->setAlignment(Align(allocainst->getAlign().value()));
-                auto *addrspacecast = builder.CreateAddrSpaceCast(new_alloca, new_type);
-                allocainst->replaceAllUsesWith(addrspacecast);
-                allocainst->eraseFromParent();
-            }
+  static char ID;
+  AMDGPUConvertAllocaInstAddressSpacePass() : FunctionPass(ID) {
+  }
+  // Returns true unconditionally: the attributes below are always (re)set, so
+  // the function counts as modified even when it contains no alloca.
+  bool runOnFunction(llvm::Function &f) override {
+    f.addFnAttr("target-cpu",
+                "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+    f.addFnAttr("target-features", "");
+    for (auto &bb : f) {
+      // Collect first, rewrite second: erasing an instruction while walking
+      // the block would invalidate the iteration.
+      std::vector<AllocaInst *> alloca_inst_vec;
+      for (Instruction &inst : bb) {
+        AllocaInst *now_alloca = dyn_cast<AllocaInst>(&inst);
+        if (!now_alloca ||
+            now_alloca->getType()->getAddressSpace() != (unsigned)0) {
+          continue;
         }
-        return false;
+        alloca_inst_vec.push_back(now_alloca);
+      }
+      for (auto &allocainst : alloca_inst_vec) {
+        auto alloca_type = allocainst->getAllocatedType();
+        IRBuilder<> builder(allocainst);
+        // Forward the element count so array allocas keep their full size.
+        auto *new_alloca = builder.CreateAlloca(alloca_type, (unsigned)5,
+                                                allocainst->getArraySize());
+        new_alloca->setAlignment(allocainst->getAlign());
+        auto *addrspacecast =
+            builder.CreateAddrSpaceCast(new_alloca, allocainst->getType());
+        allocainst->replaceAllUsesWith(addrspacecast);
+        allocainst->eraseFromParent();
+      }
     }
+    return true;
+  }
 };
 
+// Module pass that turns every function whose name starts with "runtime_"
+// into an AMDGPU kernel, gives all other defined functions private linkage,
+// and rebuilds each kernel so its pointer parameters live in address space 1.
+// Inside the rebuilt kernel the old pointer type is recovered with an
+// addrspacecast, so the body itself is left untouched.
 struct AMDGPUConvertFuncParamAddressSpacePass : public ModulePass {
-    static char ID;
-    AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {}
-    bool runOnModule(llvm::Module &M) override {
-        for (auto &f : M) {
-        bool is_kernel = false;
-        const std::string func_name = f.getName().str();
-        if (starts_with(func_name, "runtime_")) {
-          f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-          // ref https://llvm.org/docs/AMDGPUUsage.html
-          // “amdgpu-flat-work-group-size”=”min,max”
-          // Specify the minimum and maximum flat work group sizes that will be specified when the kernel is dispatched. 
-          // Generated by the amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. 
-          // The implied default value is 1,1024.
-          f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
-          is_kernel = true;
-        }
-        if (!is_kernel && !f.isDeclaration())
-          f.setLinkage(llvm::Function::PrivateLinkage);
-      }
-      std::vector<llvm::Function *> kernel_function;
-      for (auto &f : M) {
-        if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
-          kernel_function.push_back(&f);
+  static char ID;
+  AMDGPUConvertFuncParamAddressSpacePass() : ModulePass(ID) {
+  }
+  // Returns true when any function in M was retagged or rebuilt.
+  bool runOnModule(llvm::Module &M) override {
+    bool modified = false;
+    for (auto &f : M) {
+      bool is_kernel = false;
+      const std::string func_name = f.getName().str();
+      if (starts_with(func_name, "runtime_")) {
+        f.setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+        // ref https://llvm.org/docs/AMDGPUUsage.html
+        // "amdgpu-flat-work-group-size"="min,max"
+        // Specify the minimum and maximum flat work group sizes that will be
+        // specified when the kernel is dispatched. Generated by the
+        // amdgpu_flat_work_group_size CLANG attribute [CLANG-ATTR]. The implied
+        // default value is 1,1024.
+        f.addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+        is_kernel = true;
+        modified = true;
       }
-      for (auto &f : kernel_function) {
-        llvm::FunctionType *func_type = f->getFunctionType();
-        std::vector<llvm::Type*> new_func_params;
-        for (auto &arg : f->args()) {
-          if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
-            auto new_type = llvm::PointerType::get(arg.getType()->getPointerElementType(), unsigned(1));
-            new_func_params.push_back(new_type);
-          }
-          else {
-            new_func_params.push_back(arg.getType());
-          }
+      if (!is_kernel && !f.isDeclaration()) {
+        f.setLinkage(llvm::Function::PrivateLinkage);
+        modified = true;
+      }
+    }
+    std::vector<llvm::Function *> kernel_function;
+    for (auto &f : M) {
+      if (f.getCallingConv() == llvm::CallingConv::AMDGPU_KERNEL)
+        kernel_function.push_back(&f);
+    }
+    // A function's signature cannot be edited in place, so each kernel is
+    // recreated with the new parameter types and takes over the old body.
+    for (auto &f : kernel_function) {
+      llvm::FunctionType *func_type = f->getFunctionType();
+      std::vector<llvm::Type *> new_func_params;
+      for (auto &arg : f->args()) {
+        if (arg.getType()->getTypeID() == llvm::Type::PointerTyID) {
+          auto new_type = llvm::PointerType::get(
+              arg.getType()->getPointerElementType(), unsigned(1));
+          new_func_params.push_back(new_type);
+        } else {
+          new_func_params.push_back(arg.getType());
         }
-        auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(), new_func_params, false);
-        auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(), f->getAddressSpace());
-        new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-        new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
-        new_func->addFnAttr("target-cpu", "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3,4));
-        new_func->setComdat(f->getComdat());
-        f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
-        new_func->takeName(f);
-        new_func->getBasicBlockList().splice(new_func->begin(), f->getBasicBlockList());
-        for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
-                                    I2 = new_func->arg_begin(); I != E; ++I, ++I2) {
-          if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
-            auto &front_bb = new_func->getBasicBlockList().front();
-            llvm::Instruction *addrspacecast = new AddrSpaceCastInst(I2, I->getType());
-            front_bb.getInstList().insertAfter(front_bb.getFirstInsertionPt(), addrspacecast);
-            I->replaceAllUsesWith(addrspacecast);
-            I2->takeName(&*I);
-          }
-          else {
-            I->replaceAllUsesWith(&*I2);
-            I2->takeName(&*I);
-          }
+      }
+      auto new_func_type = llvm::FunctionType::get(func_type->getReturnType(),
+                                                   new_func_params, false);
+      auto new_func = llvm::Function::Create(new_func_type, f->getLinkage(),
+                                             f->getAddressSpace());
+      new_func->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
+      new_func->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+      new_func->addFnAttr(
+          "target-cpu",
+          "gfx" + AMDGPUContext::get_instance().get_mcpu().substr(3, 4));
+      new_func->setComdat(f->getComdat());
+      f->getParent()->getFunctionList().insert(f->getIterator(), new_func);
+      new_func->takeName(f);
+      new_func->getBasicBlockList().splice(new_func->begin(),
+                                           f->getBasicBlockList());
+      for (llvm::Function::arg_iterator I = f->arg_begin(), E = f->arg_end(),
+                                        I2 = new_func->arg_begin();
+           I != E; ++I, ++I2) {
+        if (I->getType()->getTypeID() == llvm::Type::PointerTyID) {
+          // An argument without uses needs no cast; this also covers
+          // declarations, which have no entry block to insert into.
+          if (!I->use_empty()) {
+            auto &front_bb = new_func->getBasicBlockList().front();
+            llvm::Instruction *addrspacecast =
+                new AddrSpaceCastInst(I2, I->getType());
+            // Insert before the first insertion point so that the cast
+            // dominates every use, including one in the first instruction.
+            front_bb.getInstList().insert(front_bb.getFirstInsertionPt(),
+                                          addrspacecast);
+            I->replaceAllUsesWith(addrspacecast);
+          }
+          I2->takeName(&*I);
+        } else {
+          I->replaceAllUsesWith(&*I2);
+          I2->takeName(&*I);
         }
-
-        f->eraseFromParent();
       }
-      return false;
+
+      f->eraseFromParent();
     }
+    return modified || !kernel_function.empty();
+  }
 };
 
 char AMDGPUConvertAllocaInstAddressSpacePass::ID = 0;
 char AMDGPUConvertFuncParamAddressSpacePass::ID = 0;
 #endif
 
-} // namespace lang
-} // namespace taichi
+}  // namespace lang
+}  // namespace taichi