From 443c22dd839cca68fafb5d433b32eb8d923b42ca Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 21 Jun 2023 17:31:19 +0300
Subject: [PATCH] [Relay] Introduce arguments limit to FuseOps pass

In PR #8313 a parameter `max_function_args` was introduced. It leads to
limit number of function argument and in case when this value is
exceeded then concatenation layer is split to a several concat
operations.

I faced a problem on Adreno GPU that for kernel with big number of
arguments the enqueueNDRange was crashed without any errors. The problem
appeared because of the huge number of arguments. But in this case not
only concat layer was a root cause of the problem. Also after fusing
several operations the final functions had a big number of arguments.

As it was discussed in #8313, adding a limitation on the number of
function arguments to the FuseOps pass might be a good improvement. In
this PR I introduced such mechanism for limitation number of function
arguments for FuseOps pass and add an arguments limit to OpenCL devices
at 128 parameters.
---
 include/tvm/relay/transform.h                 |   5 +-
 include/tvm/topi/transform.h                  |   2 +-
 python/tvm/relay/transform/transform.py       |   8 +-
 python/tvm/target/target.py                   |   2 +-
 src/relay/analysis/graph_partitioner.cc       |  91 +++++++++
 src/relay/analysis/graph_partitioner.h        |  22 +-
 src/relay/backend/build_module.cc             |   2 +-
 src/relay/backend/vm/compiler.cc              |   7 +
 src/relay/transforms/fuse_ops.cc              |  23 ++-
 src/relay/transforms/split_args.cc            |  95 ++++++---
 src/target/source/codegen_metal.cc            |   4 +-
 src/target/target_kind.cc                     |   5 +
 tests/python/relay/test_pass_fuse_ops.py      | 130 ++++++++++++
 tests/python/relay/test_pass_split_args.py    | 103 +++++-----
 .../unittest/test_target_codegen_opencl.py    | 193 +++++++++++++++++-
 15 files changed, 601 insertions(+), 91 deletions(-)

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 5f591f1d89ad4..a79bd033c964d 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -120,9 +120,12 @@ TVM_DLL Pass FoldConstant(bool fold_qnn = false);
 /*!
  * \brief Split function with huge number of arguments to smaller pieces.
  *
+ * \param max_function_args Maximum number of function arguments. If it is 0 then SplitArgs won't
+ *                          split funciton.
+ *
  * \return The pass.
  */
-TVM_DLL Pass SplitArgs(int max_function_args);
+TVM_DLL Pass SplitArgs(uint64_t max_function_args);
 
 /*!
  * \brief Fuse operations into expr into separate functions.
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index cab3466765b4e..ee2e9c6114fd2 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -722,7 +722,7 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
 }
 
 /*!
- * \brief Calcluate the output shape of strided_slice, the entry point for Relay type relation
+ * \brief Calculate the output shape of strided_slice, the entry point for Relay type relation
  *
  * \param ishape The input tensor shape
  * \param begin The indices to begin with in the slicing
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index b8af0518b29c5..c162164daace2 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1376,10 +1376,16 @@ def ToMixedPrecision(mixed_precision_type="float16", missing_op_mode=1):
 def SplitArgs(max_function_args):
     """Split function with huge number of arguments to smaller pieces.
 
+    Parameters
+    ----------
+    max_function_args: int
+      Maximum number of function arguments. If it is 0 then SplitArgs won't split function.
+
+
     Returns
     -------
     ret : tvm.transform.Pass
-        The registered pass for constant folding.
+        The registered pass.
     """
     return _ffi_api.SplitArgs(max_function_args)
 
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 0c834c5f026ef..09ee0ac20dfe1 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -194,7 +194,7 @@ def max_shared_memory_per_block(self):
 
     @property
     def max_function_args(self):
-        return int(self.attrs.get("max_function_args", -1))
+        return int(self.attrs.get("max_function_args", 0))
 
     @property
     def vtcm_capacity(self):
diff --git a/src/relay/analysis/graph_partitioner.cc b/src/relay/analysis/graph_partitioner.cc
index 861fd58d9e5c8..7852d40a56093 100644
--- a/src/relay/analysis/graph_partitioner.cc
+++ b/src/relay/analysis/graph_partitioner.cc
@@ -220,6 +220,88 @@ size_t GraphPartitioner::CountFusedNodesWithNewChild(IndexedForwardGraph::Node*
   return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent);
 }
 
+size_t GraphPartitioner::CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides) {
+  size_t any_dims = 0;
+  for (const auto& dim : ttype->shape) {
+    if (dim.as<AnyNode>()) {
+      any_dims++;
+    }
+  }
+  if (with_strides && any_dims > 0) any_dims += ttype->shape.size();
+  return any_dims;
+}
+
+size_t GraphPartitioner::CountArgs_(const tvm::Object* child, const tvm::Object* till_node) {
+  if (child == till_node) {
+    if (auto call_node = GetRef<ObjectRef>(child).as<CallNode>()) {
+      if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
+        return CountAdditionalArgs_(ttype);
+      }
+    }
+    return 1;
+  }
+  // if (argsMap_.count(child)) {
+  //  return argsMap_[child];
+  //}
+  size_t args_num = 0;
+  if (auto call_node = GetRef<ObjectRef>(child).as<CallNode>()) {
+    for (auto& it : call_node->args) {
+      if (it.as<CallNode>() || it.as<TupleNode>()) {
+        args_num += CountArgs_(it.get(), till_node);
+      } else if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
+        args_num++;
+        if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
+          args_num += CountAdditionalArgs_(ttype);
+        }
+      }
+    }
+  } else if (GetRef<ObjectRef>(child).as<VarNode>() ||
+             GetRef<ObjectRef>(child).as<TupleGetItemNode>()) {
+    args_num++;
+    if (const auto* ttype =
+            GetRef<ObjectRef>(child).as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
+      args_num += CountAdditionalArgs_(ttype);
+    }
+  } else if (auto tuple_node = GetRef<ObjectRef>(child).as<TupleNode>()) {
+    for (const auto& it : tuple_node->fields) {
+      args_num++;
+      args_num += CountArgs_(it.get(), till_node);
+    }
+  }
+  // argsMap_[child] = args_num;
+  return args_num;
+}
+
+size_t GraphPartitioner::CountArgsLimit_(const IndexedForwardGraph::Node* child) {
+  auto* outputs_list = child->outputs.head;
+  size_t output_args = 0;
+  while (outputs_list != nullptr) {
+    output_args++;
+    if (auto call_node = GetRef<ObjectRef>(outputs_list->value.node->ref).as<CallNode>()) {
+      if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
+        output_args += CountAdditionalArgs_(ttype, false);
+      }
+    }
+    outputs_list = outputs_list->next;
+  }
+  return (max_function_args_ > output_args) ? max_function_args_ - output_args : 0;
+}
+
+size_t GraphPartitioner::CountFusedArgs(IndexedForwardGraph::Node* child,
+                                        IndexedForwardGraph::Node* till_node) {
+  const tvm::Object* till_node_ref = (till_node != nullptr) ? till_node->ref : nullptr;
+  auto* outputs_list = child->outputs.head;
+  size_t res = 1;
+  while (outputs_list != nullptr) {
+    size_t output_args = 0;
+    output_args += CountArgs_(outputs_list->value.node->ref, till_node_ref);
+    res = std::max(res, output_args);
+    // argsMap_.erase(outputs_list->value.node->ref);
+    outputs_list = outputs_list->next;
+  }
+  return res;
+}
+
 void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
   groups_.resize(graph.post_dfs_order.size());
   for (size_t nid = 0; nid < groups_.size(); ++nid) {
@@ -238,6 +320,7 @@ void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
 void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
                                const DominatorTree& post_dom_tree,  //
                                int phase) {
+  IndexedForwardGraph::Node* prev_node = nullptr;
   for (size_t nid = 0; nid < groups_.size(); ++nid) {
     // the group of current node has been specified already.
     auto* graph_node = graph.post_dfs_order[nid];
@@ -254,6 +337,14 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
     // refuse the fusion if too many ops are going to be fused together
     if (CountFusedNodesWithNewChild(graph_node, dom_node->parent->gnode) > max_fuse_depth_)
       continue;
+    // refuse the fusion if too many arguments are going to be in fused function
+    auto limit = CountArgsLimit_(graph_node);
+    if (limit > 0) {
+      if (CountFusedArgs(graph_node, prev_node) > limit) {
+        prev_node = graph_node;
+        continue;
+      }
+    }
 
     if (phase == 2) {
       // Fuse injective ops into intermediate tuples, if any
diff --git a/src/relay/analysis/graph_partitioner.h b/src/relay/analysis/graph_partitioner.h
index 9433aafa119d4..0b0bdd40db5b3 100644
--- a/src/relay/analysis/graph_partitioner.h
+++ b/src/relay/analysis/graph_partitioner.h
@@ -78,7 +78,7 @@ class IndexedForwardGraph {
   std::vector<Node*> post_dfs_order;
 
   /*! \brief Dump the graph into string. */
-  void DebugDump() {
+  void DebugDump() const {
     std::ostringstream os;
     for (size_t i = 0; i < post_dfs_order.size(); ++i) {
       Node* node = post_dfs_order[i];
@@ -162,8 +162,12 @@ class DominatorTree {
  */
 class GraphPartitioner {
  public:
-  explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth)
-      : arena_(arena), opt_level_(opt_level), max_fuse_depth_(max_fuse_depth) {}
+  explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth,
+                            size_t max_function_args)
+      : arena_(arena),
+        opt_level_(opt_level),
+        max_fuse_depth_(max_fuse_depth),
+        max_function_args_(max_function_args) {}
   /*!
    * \brief Group as a union find data structure.
    */
@@ -205,10 +209,14 @@ class GraphPartitioner {
   int opt_level_;
   /*! \brief The maximum number of operations in one fused function */
   size_t max_fuse_depth_;
+  /*! \brief The maximum number of arguments in one fused function */
+  size_t max_function_args_;
   /*! \brief The internal groups. */
   std::vector<Group*> groups_;
   /*! \brief internal field used for deduplication */
   std::unordered_set<IndexedForwardGraph::Node*> visited_;
+  /*! \brief internal field used for hashing arguments number for a node */
+  std::unordered_map<const tvm::Object*, size_t> argsMap_;
   // Internal implementation of CheckPath
   template <typename F>
   bool CheckPath_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink, F fcond);
@@ -247,6 +255,9 @@ class GraphPartitioner {
   void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
 
   size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
+  size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true);
+  size_t CountArgs_(const tvm::Object* child, const tvm::Object* till_node);
+  size_t CountArgsLimit_(const IndexedForwardGraph::Node* child);
 
   // Count the number of nodes in a fused subgraph if child is additionally fused.
   // dom_parent is already known to be a part of the subgraph.
@@ -256,6 +267,11 @@ class GraphPartitioner {
   // is important for correct calculation.
   size_t CountFusedNodesWithNewChild(IndexedForwardGraph::Node* child,
                                      IndexedForwardGraph::Node* dom_parent);
+  // Count the number of arguments in a fused subgraph if child is additionally fused.
+  // Calculation goes from child node till the till_node. If till_node is a
+  // nullptr then calculate arguments till the beginning of the graph.
+  size_t CountFusedArgs(IndexedForwardGraph::Node* child,
+                        IndexedForwardGraph::Node* till_node = nullptr);
 
   // Initialize the groups.
   void InitGroups(const IndexedForwardGraph& graph);
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index abb39a65679e5..83c252d831c51 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -337,7 +337,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     if (config_->optional_homogeneous_target.defined()) {
       // This pass currently only supports the homogeneous case.
       pass_seqs.push_back(transform::SplitArgs(
-          config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", -1)
+          config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
               .value()
               .IntValue()));
     }
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index cb79970b25fcb..c5b6c7f2f0403 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -1059,6 +1059,13 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
   // Always plan devices so the remaining passes don't need to distinguish homogeneous vs
   // heterogeneous execution.
   pass_seqs.push_back(transform::PlanDevices(config_));
+  if (config_->optional_homogeneous_target.defined()) {
+    // This pass currently only supports the homogeneous case.
+    pass_seqs.push_back(transform::SplitArgs(
+        config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
+            .value()
+            .IntValue()));
+  }
 
   pass_seqs.push_back(transform::FuseOps());
 
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index 9c0d38b115877..ee005aa170523 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -319,9 +319,10 @@ class IndexedForwardGraphCreator : private ExprVisitor {
 
 class FuseMutator : private MixedModeMutator {
  public:
-  FuseMutator(int fuse_opt_level, size_t max_fuse_depth, bool link_params)
+  FuseMutator(int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args, bool link_params)
       : fuse_opt_level_(fuse_opt_level),
         max_fuse_depth_(max_fuse_depth),
+        max_function_args_(max_function_args),
         link_params_(link_params) {}
 
   // Run the transform
@@ -334,7 +335,8 @@ class FuseMutator : private MixedModeMutator {
   Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth, bool link_params) {
     // setup the group map.
     auto graph = IndexedForwardGraphCreator::Create(&arena_, body);
-    auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth).Partition(graph);
+    auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth, max_function_args_)
+                      .Partition(graph);
     for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) {
       ICHECK(graph.post_dfs_order[nid]->ref != nullptr);
       gmap_[graph.post_dfs_order[nid]->ref] = groups[nid];
@@ -347,6 +349,7 @@ class FuseMutator : private MixedModeMutator {
  private:
   int fuse_opt_level_;
   size_t max_fuse_depth_;
+  size_t max_function_args_;
   bool link_params_;
 
   using MixedModeMutator::VisitExpr_;
@@ -548,9 +551,10 @@ class FuseMutator : private MixedModeMutator {
   }
 };
 
-Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, bool link_params,
-             const IRModule& module) {
-  return FuseMutator(fuse_opt_level, max_fuse_depth, link_params).Transform(expr);
+Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args,
+             bool link_params, const IRModule& module) {
+  return FuseMutator(fuse_opt_level, max_fuse_depth, max_function_args, link_params)
+      .Transform(expr);
 }
 
 namespace transform {
@@ -567,8 +571,13 @@ Pass FuseOps(int fuse_opt_level) {
         link_params = pc->GetConfig("relay.FuseOps.link_params", Bool(link_params)).value();
         int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level;
         auto max_fuse_depth = pc->GetConfig("relay.FuseOps.max_depth", Integer(kMaxFusedOps));
-        return Downcast<Function>(
-            FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), link_params, m));
+        auto target = Target::Current();
+        size_t max_function_args =
+            (target.defined())
+                ? target->GetAttr<Integer>("max_function_args", Integer(0)).value().IntValue()
+                : 0;
+        return Downcast<Function>(FuseOps(f, opt_level, max_fuse_depth.value().IntValue(),
+                                          max_function_args, link_params, m));
       };
   return CreateFunctionPass(pass_func, 0, "FuseOps", {"InferType"});
 }
diff --git a/src/relay/transforms/split_args.cc b/src/relay/transforms/split_args.cc
index 00b9a3be3b2ee..6efab8adf7109 100644
--- a/src/relay/transforms/split_args.cc
+++ b/src/relay/transforms/split_args.cc
@@ -31,58 +31,101 @@ namespace relay {
 
 class ArgumentSplitter : public ExprRewriter {
  public:
-  explicit ArgumentSplitter(int max_function_args)
+  explicit ArgumentSplitter(size_t max_function_args)
       : max_function_args_(max_function_args), concat_op_(Op::Get("concatenate")) {}
 
+  Expr ConcatSplitter(const TupleNode* tuple_node, const tvm::Array<relay::Expr>& args, int axis,
+                      size_t limit) {
+    relay::Expr lastExpr;
+    tvm::Array<relay::Expr> new_args;
+    size_t added_args = 0;
+    for (const auto& it : args) {
+      size_t curr_args = 1;
+      if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
+        ICHECK(additional_args_cache_.count(ttype));
+        curr_args += additional_args_cache_[ttype];
+      }
+      if (added_args + curr_args > limit) {
+        Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args);
+        Expr body = MakeConcatenate(new_tuple, axis);
+        lastExpr = StopFusion(body);
+        new_args.clear();
+        new_args.push_back(lastExpr);
+        added_args = 1;
+      }
+      added_args += curr_args;
+      new_args.push_back(it);
+    }
+    Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args);
+    Expr body = MakeConcatenate(new_tuple, axis);
+    lastExpr = StopFusion(body);
+    return lastExpr;
+  }
+
+  // In case of dynamic shape in tensor, size of any_dims and strides are passed as function args
+  size_t CalculateNumberOfAdditionalArgs_(const TensorTypeNode* arg, bool isOutput = false) {
+    size_t num = 0;
+    for (const auto& dim : arg->shape) {
+      if (dim.as<AnyNode>()) {
+        num++;
+      }
+    }
+    // In case of dynamic shape also strides will be passed to function
+    // as arguments. Number of strides equals to the rank of the tensor.
+    if (num > 0 && isOutput)
+      return arg->shape.size();
+    else if (num > 0)
+      num += arg->shape.size();
+    return num;
+  }
+
   Expr Rewrite_(const CallNode* call, const Expr& post) final {
-    if (max_function_args_ < 0) return post;
+    if (max_function_args_ == 0) return post;
     if (call->op == concat_op_) {
       auto tuple_node = call->args[0].as<TupleNode>();
       const auto param = call->attrs.as<ConcatenateAttrs>();
-      int outputsNum = 1;
+      size_t outputsNum = 1;
       if (const auto* tuple_type = call->checked_type().as<TupleTypeNode>()) {
         outputsNum = tuple_type->fields.size();
+        for (const auto& it : tuple_type->fields) {
+          if (const auto* ttype = it.as<TensorTypeNode>()) {
+            outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true);
+          }
+        }
+      } else if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+        outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true);
       }
-      const int limit = max_function_args_ - outputsNum;
-      int argsNum = tuple_node->fields.size();
-      if (argsNum < limit) return post;
-      int splitNum = argsNum / limit;
-      splitNum = (argsNum % limit) ? splitNum + 1 : splitNum;
-
-      std::vector<Expr> splitted(splitNum);
-      for (int i = 0; i < splitNum; ++i) {
-        int startIdx = i * limit;
-        int argsCount = std::min(limit, argsNum - startIdx);
-        tvm::Array<Expr> args;
-        args.reserve(argsCount);
+      CHECK_GT(max_function_args_, outputsNum);
+      size_t limit = max_function_args_ - outputsNum;
 
-        for (int j = 0; j < argsCount; ++j) {
-          args.push_back(tuple_node->fields[j + startIdx]);
+      size_t argsNum = tuple_node->fields.size();
+      for (const auto& it : tuple_node->fields) {
+        if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
+          size_t any_dims = CalculateNumberOfAdditionalArgs_(ttype);
+          argsNum += any_dims;
+          additional_args_cache_[ttype] = any_dims;
         }
-        Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), args);
-        Expr body = MakeConcatenate(new_tuple, param->axis);
-        splitted[i] = StopFusion(body);
       }
-      tvm::Array<Expr> tuple_args(splitted);
-      Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), tuple_args);
-      return MakeConcatenate(new_tuple, param->axis);
+      if (argsNum < limit) return post;
+      return ConcatSplitter(tuple_node, tuple_node->fields, param->axis, limit);
     }
     return post;
   }
 
  private:
-  const int max_function_args_;
+  const size_t max_function_args_;
   const Op& concat_op_;
+  std::unordered_map<const TensorTypeNode*, size_t> additional_args_cache_;
 };
 
-Expr SplitArgs(const Expr& expr, int max_function_args) {
+Expr SplitArgs(const Expr& expr, size_t max_function_args) {
   auto rewriter = ArgumentSplitter(max_function_args);
   return PostOrderRewrite(expr, &rewriter);
 }
 
 namespace transform {
 
-Pass SplitArgs(int max_function_args) {
+Pass SplitArgs(uint64_t max_function_args) {
   runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
       [=](Function f, IRModule m, PassContext pc) {
         auto r = Downcast<Function>(SplitArgs(f, max_function_args));
diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc
index b7105e4bcdfc1..b8c30691e21fa 100644
--- a/src/target/source/codegen_metal.cc
+++ b/src/target/source/codegen_metal.cc
@@ -68,8 +68,8 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) {
 
   // Buffer arguments
   size_t num_buffer = 0;
-  int limit = target_->GetAttr<Integer>("max_function_args").value().IntValue();
-  if (static_cast<int>(f->params.size()) > limit) {
+  size_t limit = target_->GetAttr<Integer>("max_function_args").value().IntValue();
+  if (f->params.size() > limit) {
     LOG(WARNING) << "Probably you won't be able to execute your kernel due to high number of "
                     "buffers in the kernel";
   }
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 9af50d3f54edb..5431d7eceb5d7 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -365,6 +365,11 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
+    // Faced that Qualcomm OpenCL runtime was crashed without any error message in
+    // case when the number of kernel arguments was pretty big. OpenCL doesn't
+    // specify any limitations on the number of kernel arguments. max_function_args
+    // equals to 128 looks like a reasonable number of kernel arguments.
+    .add_attr_option<Integer>("max_function_args", Integer(128))
     .set_default_keys({"opencl", "gpu"});
 
 // The metal has some limitations on the number of input parameters. This is why attribute
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index 06c93fbc5549e..751eaab5c3121 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -624,6 +624,12 @@ def expected(n, max_fused_ops):
 
     assert tvm.ir.structural_equal(zz, after)
 
+    with tvm.target.Target("opencl"):
+        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
+            cl_zz = run_opt_pass(z, transform.FuseOps())
+
+    assert tvm.ir.structural_equal(cl_zz, after)
+
 
 link_params = tvm.testing.parameter(False, True)
 
@@ -828,5 +834,129 @@ def expected():
         tvm.testing.assert_allclose(result, ref, rtol=1e-4, atol=1e-4)
 
 
+target_name = tvm.testing.parameter("opencl", "metal", "cuda")
+shape_type = tvm.testing.parameter("dynamic", "static")
+
+
+def test_fuse_max_num_args(target_name, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 20)
+        number_of_any_dims = 1
+    else:
+        shape = (10, 20)
+        number_of_any_dims = 0
+    ndims = len(shape)
+    ops_num = 300
+
+    def _base_func(name):
+        x = relay.var(name, shape=shape)
+        y = relay.add(x, relay.const(1, "float32"))
+        w = relay.exp(y)
+        return x, w
+
+    def before(n):
+        inp = []
+        out = []
+        for i in range(n):
+            x, w = _base_func(f"x{i}")
+            inp.append(x)
+            out.append(w)
+        w = out[0]
+        for i in range(len(out) - 1):
+            w = relay.add(w, out[i + 1])
+        return relay.Function(inp, w)
+
+    def after(n):
+        def create_base_funcs_sum(args_number, limit, prev=None):
+            added_args = 0
+            inputs = []
+            input_vars = []
+            additional_arg = 0 if prev is None else 1
+            res = None
+            for i in range(args_number + additional_arg):
+                inp, out = _base_func(f"p{i}")
+                if i == 1 and prev is not None:
+                    res = relay.add(inp, res)
+                    input_vars.append(relay.var(f"x{i}", shape=shape))
+                    inputs.append(inp)
+                    added_args += 1 + number_of_any_dims
+                    if number_of_any_dims > 0:
+                        added_args += ndims
+                    continue
+
+                curr_args = 1 + number_of_any_dims
+                if number_of_any_dims > 0:
+                    curr_args += ndims
+
+                if added_args + curr_args > limit:
+                    f = relay.Function(inputs, res)
+                    f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                    return i - additional_arg, input_vars, f
+
+                input_vars.append(relay.var(f"x{i}", shape=shape))
+                inputs.append(inp)
+                if res is None:
+                    res = out
+                else:
+                    res = relay.add(res, out)
+                added_args += curr_args
+            f = relay.Function(inputs, res)
+            f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+            return args_number, input_vars, f
+
+        def create_accum_func(args_limit):
+            out = None
+            inputs = []
+            if args_limit == 0:
+                for i in range(n):
+                    inputs.append(relay.var(f"x{i}", shape=shape))
+                f = before(n)
+                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                out = relay.Call(f, inputs)
+                return relay.Function(inputs, out)
+
+            added_args = 0
+            while added_args < n:
+                # When out is not none that means that one additional argument
+                # will be used for the result of previous fusing
+                args_number = n - added_args
+                a_num, inp, func = create_base_funcs_sum(args_number, args_limit, out)
+                added_args += a_num
+                inputs.append(inp[0])
+                if len(inp) > 1:
+                    if out is not None:
+                        inp[1] = out
+                    else:
+                        inputs.append(inp[1])
+                else:
+                    if out is not None:
+                        inp.append(out)
+                if len(inp) > 2:
+                    inputs.extend(inp[2:])
+                out = relay.Call(func, inp)
+            return relay.Function(inputs, out)
+
+        args_limit = tvm.target.Target.current().max_function_args - (
+            1 + number_of_any_dims
+        )  # one buffer with output
+        args_limit = max(args_limit, 0)
+        return create_accum_func(args_limit)
+
+    max_fused_ops = ops_num * 5
+    with tvm.target.Target(target_name):
+        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
+            fused = run_opt_pass(before(ops_num), transform.FuseOps())
+
+        expected = run_opt_pass(after(ops_num), transform.InferType())
+        print("*" * 10)
+        print(fused)
+        print("*" * 10)
+        print("-" * 10)
+        print(expected)
+        print("-" * 10)
+
+    assert tvm.ir.structural_equal(fused, expected)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/test_pass_split_args.py b/tests/python/relay/test_pass_split_args.py
index 2039f464751f6..83f224d25e671 100644
--- a/tests/python/relay/test_pass_split_args.py
+++ b/tests/python/relay/test_pass_split_args.py
@@ -22,6 +22,10 @@
 from tvm.relay.testing import run_infer_type, create_workload
 
 
+target_name = tvm.testing.parameter("opencl", "metal", "cuda")
+shape_type = tvm.testing.parameter("dynamic", "static")
+
+
 def run_opt_pass(expr, opt_pass):
     assert isinstance(opt_pass, tvm.transform.Pass)
 
@@ -32,65 +36,70 @@ def run_opt_pass(expr, opt_pass):
     return entry if isinstance(expr, relay.Function) else entry.body
 
 
-def test_split_concat_metal():
-    shape = (1, 1, 1, 3)
-    dtype = "float32"
-    axis = 1
-    inputs = []
-    for i in range(100):
-        inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
-
-    def before():
-        inp = relay.Tuple(inputs)
-        return relay.op.concatenate(inp, axis)
-
-    def expected():
-        limit = tvm.target.Target("metal").max_function_args - 1  # one buffer with output
-        splitNum = int(len(inputs) / limit)
-        if len(inputs) % limit > 0:
-            splitNum += 1
-
-        splitted = []
-        for i in range(splitNum):
-            startIdx = i * limit
-            argsCount = min(limit, len(inputs) - startIdx)
-            args = []
-            for j in range(argsCount):
-                args.append(inputs[j + startIdx])
-            t = relay.Tuple(args)
-            concat = relay.op.concatenate(t, axis)
-            splitted.append(relay.annotation.stop_fusion(concat))
-        inp = relay.Tuple(splitted)
-        return relay.op.concatenate(inp, axis)
-
-    # the fold constant should work on any context.
-    res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("metal").max_function_args))
-    exp = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(res, exp)
-
-
-def test_split_concat_cuda():
-    shape = (1, 1, 1, 3)
+def test_split_concat(target_name, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 1, 1, 3)
+        number_of_any_dims = 1
+    else:
+        shape = (1, 1, 1, 3)
+        number_of_any_dims = 0
+    ndims = len(shape)
     dtype = "float32"
     axis = 1
+    tensors_num = 300
     inputs = []
-    for i in range(100):
+    for i in range(tensors_num):
         inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
 
     def before():
         inp = relay.Tuple(inputs)
         return relay.op.concatenate(inp, axis)
 
-    def expected():
-        inp = relay.Tuple(inputs)
-        return relay.op.concatenate(inp, axis)
+    def expected(limit):
+        if limit == 0:
+            return before()
+        limit = limit - 1  # one buffer with output
+        if number_of_any_dims > 0:
+            limit -= ndims
+
+        last_op = None
+        new_args = []
+        added_args = 0
+        num_inputs = 0
+        for inp in inputs:
+            curr_args = 1 + number_of_any_dims
+            if number_of_any_dims > 0:
+                curr_args += ndims
+            num_inputs += curr_args
+            if added_args + curr_args > limit:
+                t = relay.Tuple(new_args)
+                concat = relay.op.concatenate(t, axis)
+                last_op = relay.annotation.stop_fusion(concat)
+                new_args = [last_op]
+                added_args = 1
+            added_args += curr_args
+            new_args.append(inp)
+        t = relay.Tuple(new_args)
+        concat = relay.op.concatenate(t, axis)
+        last_op = relay.annotation.stop_fusion(concat)
+
+        if num_inputs < limit:
+            return before()
+
+        return last_op
 
     # the fold constant should work on any context.
-    res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("cuda").max_function_args))
-    exp = run_opt_pass(expected(), transform.InferType())
+    res = run_opt_pass(
+        before(),
+        transform.SplitArgs(tvm.target.Target(target_name).max_function_args),
+    )
+    exp = run_opt_pass(
+        expected(tvm.target.Target(target_name).max_function_args), transform.InferType()
+    )
+    limit = tvm.target.Target(target_name).max_function_args
+    exp = run_opt_pass(expected(limit), transform.InferType())
     assert tvm.ir.structural_equal(res, exp)
 
 
 if __name__ == "__main__":
-    test_split_concat_metal()
-    test_split_concat_cuda()
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index 83612b7f59798..b03999c9fa6a1 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -15,9 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
+from tvm import te, relay
 import tvm.testing
 import re
+import pytest
+import numpy as np
 
 target = "opencl"
 
@@ -217,5 +219,194 @@ def _check(target, n, dtype):
     _check(target, 32, "float32")
 
 
+shape_type = tvm.testing.parameter("dynamic", "static")
+executor_type = tvm.testing.parameter("ge", "vm")
+
+
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_args_split(executor_type, shape_type):
+    def _get_model():
+        if shape_type == "dynamic":
+            shape = (tvm.tir.Any(), 1, 1, 3)
+        else:
+            shape = (1, 1, 1, 3)
+        shape_np = (1, 1, 1, 3)
+        dtype = "float32"
+        axis = 1
+        tensors_num = 300
+        inputs = []
+        inputs_np = {}
+        for i in range(tensors_num):
+            inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
+            inputs_np[f"p{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+
+        inp = relay.Tuple(inputs)
+        concat = relay.op.concatenate(inp, axis)
+        return inputs_np, relay.Function(inputs, concat)
+
+    def ref_impl(inputs):
+        axis = 1
+        return np.concatenate(tuple(inputs), axis=axis)
+
+    def get_maximum_kernel_args(source):
+        def get_kernel_args(source):
+            import re
+
+            p = re.compile(r"__kernel void .+\((.*)\)")
+            args = p.findall(source)
+            return args
+
+        args = get_kernel_args(source)
+        max_args = len(args[0].split(","))
+        for arg_line in args:
+            max_args = max(max_args, len(arg_line.split(",")))
+        return max_args
+
+    def validate():
+        from tvm.contrib import graph_executor
+        from tvm.runtime.vm import VirtualMachine
+
+        input_dict, model = _get_model()
+        if executor_type == "ge":
+            with tvm.transform.PassContext(opt_level=3):
+                lib = relay.build(model, target_host="llvm", target=target)
+            ocl_lib = lib.get_lib()
+        else:
+            module = tvm.IRModule({})
+            module["main"] = model
+            with tvm.transform.PassContext(opt_level=1):
+                lib = relay.vm.compile(module, target=target, target_host="llvm")
+            ocl_lib = lib.module.imported_modules[0]
+        opencl_modules = list(
+            filter(lambda mod: mod.type_key == "opencl", ocl_lib.imported_modules)
+        )
+        assembly = opencl_modules[0].get_source()
+        with tvm.target.Target(target):
+            limit = tvm.target.Target.current().max_function_args
+        max_num = get_maximum_kernel_args(assembly)
+        assert max_num <= limit
+
+        dev = tvm.cl()
+        if executor_type == "ge":
+            module = graph_executor.GraphModule(lib["default"](dev))
+            module.set_input(**input_dict)
+            module.run()
+            tvm_out = module.get_output(0)
+        else:
+            vm = VirtualMachine(lib, dev, "naive")
+            data = {}
+            for k, v in input_dict.items():
+                data[k] = tvm.nd.array(v, dev)
+            vm.set_input("main", **data)
+            vm.invoke_stateful("main")
+            tvm_out = vm.get_outputs()[0]
+
+        np_result = ref_impl(list(input_dict.values()))
+        np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
+
+    if executor_type == "ge" and shape_type == "dynamic":
+        pytest.skip()
+    validate()
+
+
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_fuse_max_args(executor_type, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 20)
+    else:
+        shape = (1, 20)
+    shape_np = (1, 20)
+    ops_num = 300
+    dtype = "float32"
+
+    def _base_func(name):
+        x = relay.var(name, shape=shape)
+        y = relay.add(x, relay.const(1, "float32"))
+        w = relay.exp(y)
+        return x, w
+
+    def _get_model():
+        inp = []
+        inputs_np = {}
+        out = []
+        for i in range(ops_num):
+            x, w = _base_func(f"x{i}")
+            inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+            inp.append(x)
+            out.append(w)
+        w = out[0]
+        for i in range(len(out) - 1):
+            w = relay.add(w, out[i + 1])
+        return inputs_np, relay.Function(inp, w)
+
+    def ref_impl(inputs):
+        w = np.exp(inputs[0] + 1)
+        for i in range(len(inputs) - 1):
+            w = w + np.exp(inputs[i + 1] + 1)
+        return w
+
+    def get_maximum_kernel_args(source):
+        def get_kernel_args(source):
+            import re
+
+            p = re.compile(r"__kernel void .+\((.*)\)")
+            args = p.findall(source)
+            return args
+
+        args = get_kernel_args(source)
+        max_args = len(args[0].split(","))
+        for arg_line in args:
+            max_args = max(max_args, len(arg_line.split(",")))
+        return max_args
+
+    def validate():
+        from tvm.contrib import graph_executor
+        from tvm.runtime.vm import VirtualMachine
+
+        input_dict, model = _get_model()
+        if executor_type == "ge":
+            with tvm.transform.PassContext(opt_level=3):
+                lib = relay.build(model, target_host="llvm", target=target)
+            ocl_lib = lib.get_lib()
+        else:
+            module = tvm.IRModule({})
+            module["main"] = model
+            with tvm.transform.PassContext(opt_level=1):
+                lib = relay.vm.compile(module, target=target, target_host="llvm")
+            ocl_lib = lib.module.imported_modules[0]
+        opencl_modules = list(
+            filter(lambda mod: mod.type_key == "opencl", ocl_lib.imported_modules)
+        )
+        assembly = opencl_modules[0].get_source()
+        with tvm.target.Target(target):
+            limit = tvm.target.Target.current().max_function_args
+        max_num = get_maximum_kernel_args(assembly)
+        assert max_num <= limit
+
+        dev = tvm.cl()
+        if executor_type == "ge":
+            module = graph_executor.GraphModule(lib["default"](dev))
+            module.set_input(**input_dict)
+            module.run()
+            tvm_out = module.get_output(0)
+        else:
+            vm = VirtualMachine(lib, dev, "naive")
+            data = {}
+            for k, v in input_dict.items():
+                data[k] = tvm.nd.array(v, dev)
+            vm.set_input("main", **data)
+            vm.invoke_stateful("main")
+            tvm_out = vm.get_outputs()[0]
+
+        np_result = ref_impl(list(input_dict.values()))
+        np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
+
+    if executor_type == "ge" and shape_type == "dynamic":
+        pytest.skip()
+    validate()
+
+
 if __name__ == "__main__":
     tvm.testing.main()