diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 5f591f1d89ad4..da4d05f0e63e7 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -120,9 +120,12 @@ TVM_DLL Pass FoldConstant(bool fold_qnn = false); /*! * \brief Split function with huge number of arguments to smaller pieces. * + * \param max_function_args Maximum number of function arguments. If it is 0, then SplitArgs won't + * split the function. + * * \return The pass. */ -TVM_DLL Pass SplitArgs(int max_function_args); +TVM_DLL Pass SplitArgs(uint64_t max_function_args); /*! * \brief Fuse operations into expr into separate functions. diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index d881c4f423338..a40c7d63031ea 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -723,7 +723,7 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b } /*! - * \brief Calcluate the output shape of strided_slice, the entry point for Relay type relation + * \brief Calculate the output shape of strided_slice, the entry point for Relay type relation * * \param ishape The input tensor shape * \param begin The indices to begin with in the slicing diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 6b488719eb84c..26caa4584c79b 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -23,7 +23,7 @@ from . import _make from .dyn import _make as _dyn_make -from ..expr import Tuple, Expr, Constant +from ..expr import Tuple, Expr, Constant, Call from . import op as reg @@ -1141,12 +1141,15 @@ def concatenate(data, axis): result: relay.Expr The concatenated tensor. """ - data = list(data) + if not isinstance(data, Call): + data = list(data) if not data: raise ValueError("relay.concatenate requires data to be non-empty.") + if not isinstance(data, Call): + data = Tuple(data) if not isinstance(axis, int): raise ValueError("For now, we only support integer axis") - return _make.concatenate(Tuple(data), axis) + return _make.concatenate(data, axis) def einsum(data, equation):
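Note on the tensor.py hunk above: relay.concatenate can now forward an argument that is already a tuple-typed expression (a relay.Call) instead of requiring a Python sequence. A minimal sketch of both call styles, assuming this patch is applied (variable names are illustrative, not part of the patch):

    from tvm import relay

    # A Python list is wrapped into a relay Tuple, as before.
    xs = [relay.var("x%d" % i, shape=(1, 4)) for i in range(3)]
    out = relay.concatenate(xs, axis=1)

    # A tuple-typed Call (here the result of split) is now passed through unchanged,
    # which is the kind of expression the SplitArgs rewrite below produces.
    y = relay.var("y", shape=(1, 8))
    parts = relay.split(y, indices_or_sections=2, axis=1).astuple()
    out2 = relay.concatenate(parts, axis=1)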
""" return _ffi_api.SplitArgs(max_function_args) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index 0c834c5f026ef..09ee0ac20dfe1 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -194,7 +194,7 @@ def max_shared_memory_per_block(self): @property def max_function_args(self): - return int(self.attrs.get("max_function_args", -1)) + return int(self.attrs.get("max_function_args", 0)) @property def vtcm_capacity(self): diff --git a/src/relay/analysis/graph_partitioner.cc b/src/relay/analysis/graph_partitioner.cc index 861fd58d9e5c8..46dd99a25ab0d 100644 --- a/src/relay/analysis/graph_partitioner.cc +++ b/src/relay/analysis/graph_partitioner.cc @@ -169,6 +169,7 @@ void GraphPartitioner::MergeFromTo(Group* child, Group* parent) { if (child == parent) return; // update the number of nodes of the parent group parent->num_nodes += child->num_nodes; + parent->args_num += child->args_num; child->parent = parent; // update anchor ref and pattern if (child->anchor_ref != nullptr) { @@ -193,6 +194,10 @@ void GraphPartitioner::CommitFuse_(IndexedForwardGraph::Node* src, IndexedForwar } void GraphPartitioner::CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink) { + if (postpone_node_ != nullptr) { + postponed_fusing_map_.insert({postpone_node_, src}); + return; + } Group* target = groups_[sink->index]; visited_.clear(); ICHECK(src != sink); @@ -220,7 +225,112 @@ size_t GraphPartitioner::CountFusedNodesWithNewChild(IndexedForwardGraph::Node* return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent); } +size_t GraphPartitioner::CountArgs_(IndexedForwardGraph::Node* src, + const IndexedForwardGraph& graph, bool update_postpone) { + std::unordered_set visited_groups; + Group* gnode = groups_[src->index]; + ICHECK(gnode != nullptr); + auto sum = gnode->args_num; + visited_groups.insert(gnode->FindRoot()); + auto calcArgs = [this, src, &graph, &visited_groups, + update_postpone](const relay::Expr& arg) -> size_t { + if (arg.as()) return 0; + auto* node = graph.node_map.at(arg.get()); + Group* prev_group = groups_[node->index]->FindRoot(); + if (visited_groups.count(prev_group) == 0) { + visited_groups.insert(prev_group); + if (prev_group->args_num > 0) { + // Get number of arguments from group + return prev_group->args_num; + } else if (update_postpone) { + // Update pointer to node which should be postponed for deferred fusing + postpone_node_ = src; + } else { + // Calculate number of arguments for the node which wasn't processed before + return CountArgs_(node, graph, update_postpone); + } + } + return 0; + }; + if (auto call_node = GetRef(src->ref).as()) { + for (auto& it : call_node->args) { + sum += calcArgs(it); + } + } else if (auto tuple_node = GetRef(src->ref).as()) { + for (auto& it : tuple_node->fields) { + sum += calcArgs(it); + } + } + return sum; +} + +size_t GraphPartitioner::CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides) { + size_t any_dims = 0; + for (const auto& dim : ttype->shape) { + if (dim.as()) { + any_dims++; + } + } + if (with_strides && any_dims > 0) any_dims += ttype->shape.size(); + return any_dims; +} + +size_t GraphPartitioner::CountArgsLimit_(const IndexedForwardGraph::Node* child) { + auto* outputs_list = child->outputs.head; + size_t output_args = 0; + while (outputs_list != nullptr) { + output_args++; + if (auto call_node = GetRef(outputs_list->value.node->ref).as()) { + if (const auto* ttype = call_node->checked_type().as()) { + output_args += 
CountAdditionalArgs_(ttype, false); + } + } + outputs_list = outputs_list->next; + } + return (max_function_args_ > output_args) ? max_function_args_ - output_args : 0; +} + +size_t GraphPartitioner::CountFusedArgs(const IndexedForwardGraph& graph, + IndexedForwardGraph::Node* child) { + size_t args_num = 0; + auto* outputs_list = child->outputs.head; + while (outputs_list != nullptr) { + args_num = std::max(args_num, CountArgs_(outputs_list->value.node, graph)); + outputs_list = outputs_list->next; + } + return args_num; +} + void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) { + auto args_counter = [this](const tvm::Object* obj) { + size_t args_num = 0; + if (auto call_node = GetRef<ObjectRef>(obj).as<CallNode>()) { + for (auto& it : call_node->args) { + if (it.as<VarNode>() || it.as<ConstantNode>()) { + args_num++; + if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) { + args_num += CountAdditionalArgs_(ttype); + } + } + } + } else if (auto tuple_node = GetRef<ObjectRef>(obj).as<TupleNode>()) { + for (auto& it : tuple_node->fields) { + if (it.as<VarNode>() || it.as<ConstantNode>()) { + args_num++; + if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) { + args_num += CountAdditionalArgs_(ttype); + } + } + } + } else if (GetRef<ObjectRef>(obj).as<VarNode>()) { + args_num++; + if (const auto* ttype = + GetRef<ObjectRef>(obj).as<VarNode>()->checked_type().as<TensorTypeNode>()) { + args_num += CountAdditionalArgs_(ttype); + } + } + return args_num; + }; groups_.resize(graph.post_dfs_order.size()); for (size_t nid = 0; nid < groups_.size(); ++nid) { const auto* graph_node = graph.post_dfs_order[nid]; @@ -231,6 +341,7 @@ void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) { if (group_node->pattern == relay::kOutEWiseFusable) { group_node->anchor_ref = graph_node->ref; } + group_node->args_num = args_counter(graph_node->ref); groups_[nid] = group_node; } } @@ -244,6 +355,19 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph, // auto* dom_node = post_dom_tree.nodes[nid]; Group* group_node = groups_[nid]; ICHECK(group_node != nullptr); + postpone_node_ = nullptr; + if (postponed_fusing_map_.count(graph_node)) { + auto range = postponed_fusing_map_.equal_range(graph_node); + for (auto it = range.first; it != range.second; ++it) { + if (CountArgs_(graph_node, graph, false) <= CountArgsLimit_(graph_node)) { + auto* src = it->second; + auto* snode = post_dom_tree.nodes[src->index]->parent->gnode; + if (groups_[snode->index]->anchor_ref != nullptr) continue; + CommitFuse(src, snode); + } + } + postponed_fusing_map_.erase(graph_node); + } // no actions for opaque nodes if (group_node->pattern == kOpaque) continue; // no actions needed if the current node have no dominator @@ -254,6 +378,19 @@ // // refuse the fusion if too many ops are going to be fused together if (CountFusedNodesWithNewChild(graph_node, dom_node->parent->gnode) > max_fuse_depth_) continue; + // refuse the fusion if too many arguments are going to be in the fused function + if (max_function_args_ > 0) { + auto limit = CountArgsLimit_(graph_node); + if (limit > 0) { + auto args = CountFusedArgs(graph, graph_node); + if (args > limit) { + continue; + } + } + } if (phase == 2) { // Fuse injective ops into intermediate tuples, if any
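Note: a plain-Python mirror of the accounting done by CountAdditionalArgs_/CountArgsLimit_ above, to make the arithmetic easy to check. None stands in for a dynamic (tir.Any) dimension; the helper name is illustrative only:

    def args_limit(max_function_args, output_shapes):
        # One argument per output buffer, plus one size argument per dynamic dimension.
        output_args = 0
        for shape in output_shapes:
            output_args += 1 + sum(1 for dim in shape if dim is None)
        return max_function_args - output_args if max_function_args > output_args else 0

    # 128 total args, one output of shape (Any, 20): 128 - (1 buffer + 1 dynamic size) = 126
    assert args_limit(128, [(None, 20)]) == 126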
diff --git a/src/relay/analysis/graph_partitioner.h b/src/relay/analysis/graph_partitioner.h index 9433aafa119d4..a391d1464da66 100644 --- a/src/relay/analysis/graph_partitioner.h +++ b/src/relay/analysis/graph_partitioner.h @@ -78,7 +78,7 @@ class IndexedForwardGraph { std::vector<Node*> post_dfs_order; /*! \brief Dump the graph into string. */ - void DebugDump() { + void DebugDump() const { std::ostringstream os; for (size_t i = 0; i < post_dfs_order.size(); ++i) { Node* node = post_dfs_order[i]; @@ -162,8 +162,12 @@ class DominatorTree { */ class GraphPartitioner { public: - explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth) - : arena_(arena), opt_level_(opt_level), max_fuse_depth_(max_fuse_depth) {} + explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth, + size_t max_function_args) + : arena_(arena), + opt_level_(opt_level), + max_fuse_depth_(max_fuse_depth), + max_function_args_(max_function_args) {} /*! * \brief Group as a union find data structure. */ @@ -183,6 +187,10 @@ class GraphPartitioner { * \brief The number of nodes belonging to this group */ uint32_t num_nodes{1}; + /*! + * \brief The number of function arguments belonging to this group + */ + size_t args_num{0}; /*! \brief Optional attributes to annotate the grouped function. */ runtime::Map<runtime::String, ObjectRef> attrs; @@ -205,10 +213,21 @@ class GraphPartitioner { int opt_level_; /*! \brief The maximum number of operations in one fused function */ size_t max_fuse_depth_; + /*! \brief The maximum number of arguments in one fused function */ + size_t max_function_args_; /*! \brief The internal groups. */ std::vector<Group*> groups_; /*! \brief internal field used for deduplication */ std::unordered_set<IndexedForwardGraph::Node*> visited_; + /*! \brief The map of nodes whose fusing was postponed. */ + std::unordered_multimap<const IndexedForwardGraph::Node*, IndexedForwardGraph::Node*> + postponed_fusing_map_; + /*! + * \brief Fusing of this node should be postponed until all its child nodes are evaluated. + * It is used to calculate the number of arguments which will be passed to this node in + * the generated function. + */ + const IndexedForwardGraph::Node* postpone_node_{nullptr}; // Internal implementation of CheckPath template <typename F> bool CheckPath_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink, F fcond); @@ -247,6 +266,23 @@ void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink); size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink); + // Count the number of additional arguments. In case of dynamic shape, the + // generated function takes several additional arguments, such as the sizes of + // dynamic dimensions and strides. + // This function calculates the number of such additional arguments. + size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true); + // Calculate the number of arguments for the node. + size_t CountArgs_(IndexedForwardGraph::Node* src, const IndexedForwardGraph& graph, + bool update_postpone = true); + // Count the actual limit of arguments for a generated function. + // max_function_args_ specifies the maximum number of function arguments. But + // usually, output tensors are also passed to the function as arguments. + // Additionally, in case of dynamic shape, it is necessary to take into + // account the parameters which specify the sizes of dynamic dimensions. + // This function computes the limit of arguments by the following formula: + // limit = max_function_args_ - output_args_count + size_t CountArgsLimit_(const IndexedForwardGraph::Node* child); // Count the number of nodes in a fused subgraph if child is additionally fused.
// dom_parent is already known to be a part of the subgraph. @@ -256,6 +292,10 @@ class GraphPartitioner { // is important for correct calculation. size_t CountFusedNodesWithNewChild(IndexedForwardGraph::Node* child, IndexedForwardGraph::Node* dom_parent); + // Count the number of arguments in a fused subgraph if child is additionally fused. + // The calculation goes from the child node up to the beginning of the graph. + size_t CountFusedArgs(const IndexedForwardGraph& graph, IndexedForwardGraph::Node* child); // Initialize the groups. void InitGroups(const IndexedForwardGraph& graph); diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index abb39a65679e5..83c252d831c51 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -337,7 +337,7 @@ class RelayBuildModule : public runtime::ModuleNode { if (config_->optional_homogeneous_target.defined()) { // This pass currently only supports the homogeneous case. pass_seqs.push_back(transform::SplitArgs( - config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", -1) + config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0) .value() .IntValue())); } diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index cb79970b25fcb..c5b6c7f2f0403 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -1059,6 +1059,13 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) { // Always plan devices so the remaining passes don't need to distinguish homogeneous vs // heterogeneous execution. pass_seqs.push_back(transform::PlanDevices(config_)); + if (config_->optional_homogeneous_target.defined()) { + // This pass currently only supports the homogeneous case. + pass_seqs.push_back(transform::SplitArgs( + config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0) + .value() + .IntValue())); + } pass_seqs.push_back(transform::FuseOps());
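Note: with the vm/compiler.cc hunk above, the VM pipeline now runs SplitArgs before FuseOps, matching the graph-executor pipeline in build_module.cc. A sketch of the path this enables (assumes an OpenCL-enabled build; mirrors the VM test added at the end of this patch):

    import tvm
    from tvm import relay

    inputs = [relay.var("p%d" % i, shape=(1, 4)) for i in range(300)]
    body = relay.concatenate(relay.Tuple(inputs), axis=1)
    mod = tvm.IRModule.from_expr(relay.Function(inputs, body))
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="opencl", target_host="llvm")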
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 9c0d38b115877..ee005aa170523 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -319,9 +319,10 @@ class IndexedForwardGraphCreator : private ExprVisitor { class FuseMutator : private MixedModeMutator { public: - FuseMutator(int fuse_opt_level, size_t max_fuse_depth, bool link_params) + FuseMutator(int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args, bool link_params) : fuse_opt_level_(fuse_opt_level), max_fuse_depth_(max_fuse_depth), + max_function_args_(max_function_args), link_params_(link_params) {} // Run the transform @@ -334,7 +335,8 @@ class FuseMutator : private MixedModeMutator { Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth, bool link_params) { // setup the group map. auto graph = IndexedForwardGraphCreator::Create(&arena_, body); - auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth).Partition(graph); + auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth, max_function_args_) + .Partition(graph); for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) { ICHECK(graph.post_dfs_order[nid]->ref != nullptr); gmap_[graph.post_dfs_order[nid]->ref] = groups[nid]; @@ -347,6 +349,7 @@ class FuseMutator : private MixedModeMutator { private: int fuse_opt_level_; size_t max_fuse_depth_; + size_t max_function_args_; bool link_params_; using MixedModeMutator::VisitExpr_; @@ -548,9 +551,10 @@ class FuseMutator : private MixedModeMutator { } }; -Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, bool link_params, - const IRModule& module) { - return FuseMutator(fuse_opt_level, max_fuse_depth, link_params).Transform(expr); +Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args, + bool link_params, const IRModule& module) { + return FuseMutator(fuse_opt_level, max_fuse_depth, max_function_args, link_params) + .Transform(expr); } namespace transform { @@ -567,8 +571,13 @@ Pass FuseOps(int fuse_opt_level) { link_params = pc->GetConfig("relay.FuseOps.link_params", Bool(link_params)).value(); int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level; auto max_fuse_depth = pc->GetConfig("relay.FuseOps.max_depth", Integer(kMaxFusedOps)); - return Downcast<Function>( - FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), link_params, m)); + auto target = Target::Current(); + size_t max_function_args = + (target.defined()) + ? target->GetAttr<Integer>("max_function_args", Integer(0)).value().IntValue() + : 0; + return Downcast<Function>(FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), + max_function_args, link_params, m)); }; return CreateFunctionPass(pass_func, 0, "FuseOps", {"InferType"}); } diff --git a/src/relay/transforms/split_args.cc b/src/relay/transforms/split_args.cc index 00b9a3be3b2ee..6ef404ee814d9 100644 --- a/src/relay/transforms/split_args.cc +++ b/src/relay/transforms/split_args.cc @@ -31,58 +31,101 @@ namespace relay { class ArgumentSplitter : public ExprRewriter { public: - explicit ArgumentSplitter(int max_function_args) + explicit ArgumentSplitter(size_t max_function_args) : max_function_args_(max_function_args), concat_op_(Op::Get("concatenate")) {} + Expr ConcatSplitter(const TupleNode* tuple_node, const tvm::Array<Expr>& args, int axis, + size_t limit) { + tvm::Array<Expr> new_args; + size_t added_args = 0; + for (const auto& it : args) { + size_t curr_args = 1; + if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) { + ICHECK(additional_args_cache_.count(ttype)); + curr_args += additional_args_cache_[ttype]; + } + if (added_args + curr_args > limit) { + Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args); + Expr stop = StopFusion(new_tuple); + Expr lastExpr = MakeConcatenate(stop, axis); + new_args.clear(); + new_args.push_back(lastExpr); + added_args = curr_args; + } + added_args += curr_args; + new_args.push_back(it); + } + Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args); + Expr stop = StopFusion(new_tuple); + Expr lastExpr = MakeConcatenate(stop, axis); + return lastExpr; + } + + // In case of a dynamic shape in a tensor, the sizes of dynamic dimensions and strides are passed as function args + size_t CalculateNumberOfAdditionalArgs_(const TensorTypeNode* arg, bool isOutput = false) { + size_t num = 0; + for (const auto& dim : arg->shape) { + if (dim.as<AnyNode>()) {
num++; + } + } + // In case of a dynamic shape, strides will also be passed to the function + // as arguments. The number of strides equals the rank of the tensor. + if (num > 0 && isOutput) + return arg->shape.size(); + else if (num > 0) + num += arg->shape.size(); + return num; + } + Expr Rewrite_(const CallNode* call, const Expr& post) final { - if (max_function_args_ < 0) return post; + if (max_function_args_ == 0) return post; if (call->op == concat_op_) { auto tuple_node = call->args[0].as<TupleNode>(); + if (tuple_node == nullptr) return post; const auto param = call->attrs.as<ConcatenateAttrs>(); - int outputsNum = 1; + size_t outputsNum = 1; if (const auto* tuple_type = call->checked_type().as<TupleTypeNode>()) { outputsNum = tuple_type->fields.size(); + for (const auto& it : tuple_type->fields) { + if (const auto* ttype = it.as<TensorTypeNode>()) { + outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true); + } + } + } else if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) { + outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true); } - const int limit = max_function_args_ - outputsNum; - int argsNum = tuple_node->fields.size(); - if (argsNum < limit) return post; - int splitNum = argsNum / limit; - splitNum = (argsNum % limit) ? splitNum + 1 : splitNum; - - std::vector<Expr> splitted(splitNum); - for (int i = 0; i < splitNum; ++i) { - int startIdx = i * limit; - int argsCount = std::min(limit, argsNum - startIdx); - tvm::Array<Expr> args; - args.reserve(argsCount); + CHECK_GT(max_function_args_, outputsNum); + size_t limit = max_function_args_ - outputsNum; - for (int j = 0; j < argsCount; ++j) { - args.push_back(tuple_node->fields[j + startIdx]); + size_t argsNum = tuple_node->fields.size(); + for (const auto& it : tuple_node->fields) { + if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) { + size_t any_dims = CalculateNumberOfAdditionalArgs_(ttype); + argsNum += any_dims; + additional_args_cache_[ttype] = any_dims; } - Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), args); - Expr body = MakeConcatenate(new_tuple, param->axis); - splitted[i] = StopFusion(body); } - tvm::Array<Expr> tuple_args(splitted); - Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), tuple_args); - return MakeConcatenate(new_tuple, param->axis); + if (argsNum < limit) return post; + return ConcatSplitter(tuple_node, tuple_node->fields, param->axis, limit); } return post; } private: - const int max_function_args_; + const size_t max_function_args_; const Op& concat_op_; + std::unordered_map<const TensorTypeNode*, size_t> additional_args_cache_; }; -Expr SplitArgs(const Expr& expr, int max_function_args) { +Expr SplitArgs(const Expr& expr, size_t max_function_args) { auto rewriter = ArgumentSplitter(max_function_args); return PostOrderRewrite(expr, &rewriter); } namespace transform { -Pass SplitArgs(int max_function_args) { +Pass SplitArgs(uint64_t max_function_args) { runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func = [=](Function f, IRModule m, PassContext pc) { auto r = Downcast<Function>(SplitArgs(f, max_function_args)); diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc index b7105e4bcdfc1..b8c30691e21fa 100644 --- a/src/target/source/codegen_metal.cc +++ b/src/target/source/codegen_metal.cc @@ -68,8 +68,8 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { // Buffer arguments size_t num_buffer = 0; - int limit = target_->GetAttr<Integer>("max_function_args").value().IntValue(); - if (static_cast<int>(f->params.size()) > limit) { + size_t limit = target_->GetAttr<Integer>("max_function_args").value().IntValue(); + if (f->params.size() > limit) { LOG(WARNING) << "Probably you won't be able to execute your kernel due to high number of " "buffers in the kernel"; }
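Note: a quick check of the attribute plumbing, assuming a build with this patch applied. The default of 0 now means "no limit" (the old sentinel was -1), and the OpenCL value of 128 comes from the target_kind.cc hunk below:

    import tvm

    assert tvm.target.Target("llvm").max_function_args == 0      # attribute absent: no limit
    assert tvm.target.Target("opencl").max_function_args == 128  # registered below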
number of " "buffers in the kernel"; } diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 174ae6d43fc81..bb4d9519ba627 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -365,6 +365,11 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL) .add_attr_option("max_num_threads", Integer(256)) .add_attr_option("thread_warp_size", Integer(1)) .add_attr_option("texture_spatial_limit", Integer(16384)) + // Faced that Qualcomm OpenCL runtime was crashed without any error message in + // case when the number of kernel arguments was pretty big. OpenCL doesn't + // specify any limitations on the number of kernel arguments. max_function_args + // equals to 128 looks like a reasonable number of kernel arguments. + .add_attr_option("max_function_args", Integer(128)) .set_default_keys({"opencl", "gpu"}); // The metal has some limitations on the number of input parameters. This is why attribute diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 06c93fbc5549e..4dd05ee782116 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -624,6 +624,12 @@ def expected(n, max_fused_ops): assert tvm.ir.structural_equal(zz, after) + with tvm.target.Target("opencl"): + with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}): + cl_zz = run_opt_pass(z, transform.FuseOps()) + + assert tvm.ir.structural_equal(cl_zz, after) + link_params = tvm.testing.parameter(False, True) @@ -828,5 +834,114 @@ def expected(): tvm.testing.assert_allclose(result, ref, rtol=1e-4, atol=1e-4) +target_name = tvm.testing.parameter("opencl", "metal", "cuda") +shape_type = tvm.testing.parameter("dynamic", "static") + + +def test_fuse_max_num_args(target_name, shape_type): + if shape_type == "dynamic": + shape = (tvm.tir.Any(), 20) + number_of_any_dims = 1 + else: + shape = (10, 20) + number_of_any_dims = 0 + ndims = len(shape) + ops_num = 300 + + def _base_func(name): + x = relay.var(name, shape=shape) + w = relay.add(x, relay.const(1, "float32")) + return x, w + + def before(n): + inp = [] + out = [] + for i in range(n): + x, w = _base_func(f"x{i}") + inp.append(x) + out.append(w) + w = out[0] + for i in range(len(out) - 1): + w = relay.add(w, out[i + 1]) + return relay.Function(inp, w) + + def after(n): + def create_fused_func(limit): + added_args = 0 + inputs = [] + input_vars = [] + res = None + i = 0 + while added_args < limit: + inp, out = _base_func(f"p{i}") + + curr_args = 1 + number_of_any_dims + if number_of_any_dims > 0: + curr_args += ndims + + if added_args + curr_args > limit: + f = relay.Function(inputs, res) + f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + return i, input_vars, f + + input_vars.append(relay.var(f"x{i}", shape=shape)) + inputs.append(inp) + if res is None: + res = out + else: + res = relay.add(res, out) + added_args += curr_args + i += 1 + f = relay.Function(inputs, res) + f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + return i, input_vars, f + + def create_accum_func(args_limit): + out = None + inputs = [] + if args_limit == 0: + for i in range(n): + inputs.append(relay.var(f"x{i}", shape=shape)) + f = before(n) + f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + out = relay.Call(f, inputs) + return relay.Function(inputs, out) + + i, inputs, func = create_fused_func(args_limit) + out = relay.Call(func, inputs) + while i < n: + inp, func = _base_func(f"p{i}") + inputs.append(relay.var(f"xa{i}", shape=shape)) + curr_args = 
1 + number_of_any_dims + if number_of_any_dims > 0: + curr_args += ndims + f = relay.Function([inp], func) + f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + w = relay.Call(f, [inputs[-1]]) + a = relay.var(f"a", shape=shape) + b = relay.var(f"b", shape=shape) + out_add = relay.add(a, b) + f = relay.Function([a, b], out_add) + f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + out = relay.Call(f, [out, w]) + i += 1 + return relay.Function(inputs, out) + + args_limit = tvm.target.Target.current().max_function_args - ( + 1 + number_of_any_dims + ) # one buffer with output + args_limit = max(args_limit, 0) + return create_accum_func(args_limit) + + max_fused_ops = ops_num * 5 + with tvm.target.Target(target_name): + with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}): + fused = run_opt_pass(before(ops_num), transform.FuseOps()) + + expected = run_opt_pass(after(ops_num), transform.InferType()) + + assert tvm.ir.structural_equal(fused, expected) + + if __name__ == "__main__": tvm.testing.main() diff --git a/tests/python/relay/test_pass_split_args.py b/tests/python/relay/test_pass_split_args.py index 2039f464751f6..508f74f11269c 100644 --- a/tests/python/relay/test_pass_split_args.py +++ b/tests/python/relay/test_pass_split_args.py @@ -22,6 +22,10 @@ from tvm.relay.testing import run_infer_type, create_workload +target_name = tvm.testing.parameter("opencl", "metal", "cuda") +shape_type = tvm.testing.parameter("dynamic", "static") + + def run_opt_pass(expr, opt_pass): assert isinstance(opt_pass, tvm.transform.Pass) @@ -32,65 +36,63 @@ def run_opt_pass(expr, opt_pass): return entry if isinstance(expr, relay.Function) else entry.body -def test_split_concat_metal(): - shape = (1, 1, 1, 3) - dtype = "float32" - axis = 1 - inputs = [] - for i in range(100): - inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype)) - - def before(): - inp = relay.Tuple(inputs) - return relay.op.concatenate(inp, axis) - - def expected(): - limit = tvm.target.Target("metal").max_function_args - 1 # one buffer with output - splitNum = int(len(inputs) / limit) - if len(inputs) % limit > 0: - splitNum += 1 - - splitted = [] - for i in range(splitNum): - startIdx = i * limit - argsCount = min(limit, len(inputs) - startIdx) - args = [] - for j in range(argsCount): - args.append(inputs[j + startIdx]) - t = relay.Tuple(args) - concat = relay.op.concatenate(t, axis) - splitted.append(relay.annotation.stop_fusion(concat)) - inp = relay.Tuple(splitted) - return relay.op.concatenate(inp, axis) - - # the fold constant should work on any context. 
- res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("metal").max_function_args)) - exp = run_opt_pass(expected(), transform.InferType()) - assert tvm.ir.structural_equal(res, exp) - - -def test_split_concat_cuda(): - shape = (1, 1, 1, 3) +def test_split_concat(target_name, shape_type): + if shape_type == "dynamic": + shape = (tvm.tir.Any(), 1, 1, 3) + number_of_any_dims = 1 + else: + shape = (1, 1, 1, 3) + number_of_any_dims = 0 + ndims = len(shape) dtype = "float32" axis = 1 + tensors_num = 300 inputs = [] - for i in range(100): + for i in range(tensors_num): inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype)) def before(): inp = relay.Tuple(inputs) return relay.op.concatenate(inp, axis) - def expected(): - inp = relay.Tuple(inputs) - return relay.op.concatenate(inp, axis) + def expected(limit): + if limit == 0: + return before() + limit = limit - 1 # one buffer with output + if number_of_any_dims > 0: + limit -= ndims + + new_args = [] + added_args = 0 + num_inputs = 0 + for inp in inputs: + curr_args = 1 + number_of_any_dims + if number_of_any_dims > 0: + curr_args += ndims + num_inputs += curr_args + if added_args + curr_args > limit: + t = relay.Tuple(new_args) + stop = relay.annotation.stop_fusion(t) + concat = relay.op.concatenate(stop, axis) + new_args = [concat] + added_args = curr_args + added_args += curr_args + new_args.append(inp) + t = relay.Tuple(new_args) + stop = relay.annotation.stop_fusion(t) + concat = relay.op.concatenate(stop, axis) + + if num_inputs < limit: + return before() + + return concat # the fold constant should work on any context. - res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("cuda").max_function_args)) - exp = run_opt_pass(expected(), transform.InferType()) + limit = tvm.target.Target(target_name).max_function_args + res = run_opt_pass(before(), transform.SplitArgs(limit)) + exp = run_opt_pass(expected(limit), transform.InferType()) assert tvm.ir.structural_equal(res, exp) if __name__ == "__main__": - test_split_concat_metal() - test_split_concat_cuda() + tvm.testing.main() diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py index 83612b7f59798..dcb43f29daf49 100644 --- a/tests/python/unittest/test_target_codegen_opencl.py +++ b/tests/python/unittest/test_target_codegen_opencl.py @@ -15,9 +15,11 @@ # specific language governing permissions and limitations # under the License. 
import tvm -from tvm import te +from tvm import te, relay import tvm.testing import re +import pytest +import numpy as np target = "opencl" @@ -217,5 +219,211 @@ def _check(target, n, dtype): _check(target, 32, "float32") +def _get_maximum_kernel_args(source): + def get_kernel_args(source): + import re + + p = re.compile(r"__kernel void .+\((.*)\)") + args = p.findall(source) + return args + + args = get_kernel_args(source) + max_args = len(args[0].split(",")) + for arg_line in args: + max_args = max(max_args, len(arg_line.split(","))) + return max_args + + +def _validate_opencl_executors(executor_type, get_model, ref_impl): + from tvm.contrib import graph_executor + from tvm.runtime.vm import VirtualMachine + + input_dict, model = get_model() + if executor_type == "ge": + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(model, target_host="llvm", target=target) + ocl_lib = lib.get_lib() + else: + module = tvm.IRModule({}) + module["main"] = model + with tvm.transform.PassContext(opt_level=3): + lib = relay.vm.compile(module, target=target, target_host="llvm") + ocl_lib = lib.module.imported_modules[0] + opencl_modules = list(filter(lambda mod: mod.type_key == "opencl", ocl_lib.imported_modules)) + assembly = opencl_modules[0].get_source() + with tvm.target.Target(target): + limit = tvm.target.Target.current().max_function_args + max_num = _get_maximum_kernel_args(assembly) + assert max_num <= limit + + dev = tvm.cl() + if executor_type == "ge": + module = graph_executor.GraphModule(lib["default"](dev)) + module.set_input(**input_dict) + module.run() + tvm_out = module.get_output(0) + else: + vm = VirtualMachine(lib, dev, "naive") + data = {} + for k, v in input_dict.items(): + data[k] = tvm.nd.array(v, dev) + vm.set_input("main", **data) + vm.invoke_stateful("main") + tvm_out = vm.get_outputs()[0] + + np_result = ref_impl(list(input_dict.values())) + np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-2, atol=1e-2) + + +shape_type = tvm.testing.parameter("dynamic", "static") +executor_type = tvm.testing.parameter("ge", "vm") + + +@tvm.testing.requires_gpu +@tvm.testing.requires_opencl +def test_opencl_args_split(executor_type, shape_type): + def _get_model(): + if shape_type == "dynamic": + shape = (tvm.tir.Any(), 1, 1, 3) + else: + shape = (1, 1, 1, 3) + shape_np = (1, 1, 1, 3) + dtype = "float32" + axis = 1 + tensors_num = 300 + inputs = [] + inputs_np = {} + for i in range(tensors_num): + inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype)) + inputs_np[f"p{i}"] = np.random.uniform(size=shape_np).astype(dtype) + + inp = relay.Tuple(inputs) + concat = relay.op.concatenate(inp, axis) + return inputs_np, relay.Function(inputs, concat) + + def ref_impl(inputs): + axis = 1 + return np.concatenate(tuple(inputs), axis=axis) + + if executor_type == "ge" and shape_type == "dynamic": + pytest.skip() + _validate_opencl_executors(executor_type, _get_model, ref_impl) + + +@tvm.testing.requires_gpu +@tvm.testing.requires_opencl +def test_opencl_fuse_max_args(executor_type, shape_type): + if shape_type == "dynamic": + shape = (tvm.tir.Any(), 20) + ops_num = 80 + else: + shape = (1, 20) + ops_num = 300 + shape_np = (1, 20) + dtype = "float32" + + def _base_func(name): + x = relay.var(name, shape=shape) + y = relay.add(x, relay.const(1, "float32")) + w = relay.exp(y) + return x, w + + def _get_model(): + inp = [] + inputs_np = {} + out = [] + for i in range(ops_num): + x, w = _base_func(f"x{i}") + inputs_np[f"x{i}"] = 
np.random.uniform(size=shape_np).astype(dtype) + inp.append(x) + out.append(w) + w = out[0] + for i in range(len(out) - 1): + w = relay.add(w, out[i + 1]) + return inputs_np, relay.Function(inp, w) + + def ref_impl(inputs): + w = np.exp(inputs[0] + 1) + for i in range(len(inputs) - 1): + w = w + np.exp(inputs[i + 1] + 1) + return w + + if executor_type == "ge" and shape_type == "dynamic": + pytest.skip() + _validate_opencl_executors(executor_type, _get_model, ref_impl) + + +@tvm.testing.requires_gpu +@tvm.testing.requires_opencl +def test_fuse_concat_max_num_args(executor_type, shape_type): + """ + In this test before concat we have an operation with 3 inputs. In the + SplitArgs we cannot calculate these inputs as inputs to concat, because + they will be added to the concat after fusing operation. So FuseOps pass + should handle this case and stop fusing before concat. + + The example: + x y z x y z + \ | / \ | / + \ | / \ | / + where ... where + | | + exp exp + \ / + \ / + \-----> concat <-----/ + """ + if shape_type == "dynamic": + shape = (tvm.tir.Any(), 20) + ops_num = 80 + else: + shape = (10, 20) + ops_num = 300 + shape_np = (10, 20) + dtype = "float32" + axis = 1 + + def _base_func(name): + x = relay.var(name, shape=shape) + y = relay.var(f"y{name}", shape=shape) + z = relay.var(f"z{name}", shape=shape) + cond = relay.less(x, relay.const(1, "float32")) + l = relay.add(y, relay.const(1, "float32")) + r = relay.add(z, relay.const(5, "float32")) + w = relay.where(cond, l, r) + w = relay.exp(w) + return [x, y, z], w + + def _get_model(): + inp = [] + out = [] + inputs_np = {} + for i in range(ops_num): + inputs, w = _base_func(f"x{i}") + inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype) + inputs_np[f"yx{i}"] = np.random.uniform(size=shape_np).astype(dtype) + inputs_np[f"zx{i}"] = np.random.uniform(size=shape_np).astype(dtype) + inp.extend(inputs) + out.append(w) + t = relay.Tuple(out) + w = relay.op.concatenate(t, axis) + return inputs_np, relay.Function(inp, w) + + def ref_impl(inputs): + res = [] + for i in range(0, len(inputs), 3): + x = inputs[i] + y = inputs[i + 1] + z = inputs[i + 2] + comp = np.where(x < 1, y + 1, z + 5) + comp = np.exp(comp) + res.append(comp) + return np.concatenate(tuple(res), axis=axis) + + if executor_type == "ge" and shape_type == "dynamic": + pytest.skip() + _validate_opencl_executors(executor_type, _get_model, ref_impl) + + if __name__ == "__main__": tvm.testing.main()
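Note: an end-to-end sketch of the pass in isolation, mirroring test_pass_split_args.py above; with max_function_args == 128 the single 300-input concatenate is rewritten into a chain of narrower concatenates (assumes a build with this patch applied):

    import tvm
    from tvm import relay
    from tvm.relay import transform

    inputs = [relay.var("p%d" % i, shape=(1, 1, 1, 3)) for i in range(300)]
    func = relay.Function(inputs, relay.concatenate(relay.Tuple(inputs), axis=1))
    mod = tvm.IRModule.from_expr(func)
    limit = tvm.target.Target("opencl").max_function_args  # 128 with this patch
    mod = transform.InferType()(mod)
    mod = transform.SplitArgs(limit)(mod)
    print(mod["main"])  # the wide concatenate is now a chain of smaller ones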