[Relay] Introduce arguments limit to FuseOps pass
PR apache#8313 introduced a `max_function_args` parameter. It limits the number of function arguments, and when this limit is exceeded the concatenation layer is split into several concat operations.

I faced a problem on Adreno GPU where, for kernels with a large number of arguments, enqueueNDRange crashed without reporting any error. The crash was caused by the sheer number of arguments, and the concat layer was not the only root cause: after fusing several operations, the resulting functions also had a large number of arguments.

As discussed in apache#8313, adding a limit on the number of function arguments to the FuseOps pass might be a good improvement. In this PR I introduce such a mechanism, limiting the number of function arguments in the FuseOps pass, and set the argument limit for OpenCL devices to 128 parameters.
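As a sketch of the intended workflow (illustrative only: the relu model and the plain `opencl` target string are placeholders, and an OpenCL-enabled TVM build is assumed):

```python
import tvm
from tvm import relay

# Placeholder model: any Relay module works; relu is just a stand-in.
x = relay.var("x", shape=(1, 32))
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

# On an OpenCL target the argument limit (128 per this commit message) is
# read from the target and enforced by both SplitArgs and FuseOps.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="opencl")
```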
echuraev committed Jul 5, 2023
1 parent b37ad17 commit 443c22d
Showing 15 changed files with 601 additions and 91 deletions.
5 changes: 4 additions & 1 deletion include/tvm/relay/transform.h
@@ -120,9 +120,12 @@ TVM_DLL Pass FoldConstant(bool fold_qnn = false);
/*!
* \brief Split function with huge number of arguments to smaller pieces.
*
* \param max_function_args Maximum number of function arguments. If it is 0 then SplitArgs won't
* split the function.
*
* \return The pass.
*/
TVM_DLL Pass SplitArgs(int max_function_args);
TVM_DLL Pass SplitArgs(uint64_t max_function_args);

/*!
* \brief Fuse operations into expr into separate functions.
2 changes: 1 addition & 1 deletion include/tvm/topi/transform.h
@@ -722,7 +722,7 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
}

/*!
* \brief Calcluate the output shape of strided_slice, the entry point for Relay type relation
* \brief Calculate the output shape of strided_slice, the entry point for Relay type relation
*
* \param ishape The input tensor shape
* \param begin The indices to begin with in the slicing
8 changes: 7 additions & 1 deletion python/tvm/relay/transform/transform.py
@@ -1376,10 +1376,16 @@ def ToMixedPrecision(mixed_precision_type="float16", missing_op_mode=1):
def SplitArgs(max_function_args):
"""Split function with huge number of arguments to smaller pieces.
Parameters
----------
max_function_args: int
Maximum number of function arguments. If it is 0, SplitArgs won't split the function.
Returns
-------
ret : tvm.transform.Pass
The registered pass for constant folding.
The registered pass.
"""
return _ffi_api.SplitArgs(max_function_args)

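A minimal usage sketch of the pass (the shapes and the limit of 4 are made up for illustration):

```python
import tvm
from tvm import relay

# Eight inputs concatenated at once: more arguments than our toy limit.
xs = [relay.var("x%d" % i, shape=(1, 4)) for i in range(8)]
out = relay.concatenate(xs, axis=1)
mod = tvm.IRModule.from_expr(relay.Function(xs, out))

mod = relay.transform.InferType()(mod)
# With a limit of 4, the single wide concat is rewritten into a chain of
# narrower concats so that no generated function takes more arguments.
mod = relay.transform.SplitArgs(4)(mod)
print(mod)
```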
2 changes: 1 addition & 1 deletion python/tvm/target/target.py
@@ -194,7 +194,7 @@ def max_shared_memory_per_block(self):

@property
def max_function_args(self):
return int(self.attrs.get("max_function_args", -1))
return int(self.attrs.get("max_function_args", 0))

@property
def vtcm_capacity(self):
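The fallback changes from -1 to 0 so that "no limit" matches the SplitArgs convention above; a quick check of my reading of the new default (assuming the llvm target kind defines no such attribute):

```python
import tvm

# Targets that do not define the attribute now report 0 ("no limit")
# instead of the old -1 sentinel.
assert tvm.target.Target("llvm").max_function_args == 0
```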
91 changes: 91 additions & 0 deletions src/relay/analysis/graph_partitioner.cc
@@ -220,6 +220,88 @@ size_t GraphPartitioner::CountFusedNodesWithNewChild(IndexedForwardGraph::Node*
return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent);
}

size_t GraphPartitioner::CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides) {
size_t any_dims = 0;
for (const auto& dim : ttype->shape) {
if (dim.as<AnyNode>()) {
any_dims++;
}
}
if (with_strides && any_dims > 0) any_dims += ttype->shape.size();
return any_dims;
}
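A worked example of the count above, as I read it: each dynamic (Any) dimension contributes its extent as an extra argument, and if any dimension is dynamic the whole stride vector is passed as well.

```python
# Pure-Python mirror of CountAdditionalArgs_ for intuition (not a TVM API).
def count_additional_args(shape, with_strides=True):
    any_dims = sum(1 for dim in shape if dim == "Any")  # dynamic extents
    if with_strides and any_dims > 0:
        any_dims += len(shape)  # one stride argument per dimension
    return any_dims

# Rank-3 tensor with two dynamic extents: 2 extents + 3 strides = 5 extras.
assert count_additional_args(["Any", 32, "Any"]) == 5
```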

size_t GraphPartitioner::CountArgs_(const tvm::Object* child, const tvm::Object* till_node) {
if (child == till_node) {
if (auto call_node = GetRef<ObjectRef>(child).as<CallNode>()) {
if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
return CountAdditionalArgs_(ttype);
}
}
return 1;
}
// if (argsMap_.count(child)) {
// return argsMap_[child];
//}
size_t args_num = 0;
if (auto call_node = GetRef<ObjectRef>(child).as<CallNode>()) {
for (auto& it : call_node->args) {
if (it.as<CallNode>() || it.as<TupleNode>()) {
args_num += CountArgs_(it.get(), till_node);
} else if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
args_num++;
if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
args_num += CountAdditionalArgs_(ttype);
}
}
}
} else if (GetRef<ObjectRef>(child).as<VarNode>() ||
GetRef<ObjectRef>(child).as<TupleGetItemNode>()) {
args_num++;
if (const auto* ttype =
GetRef<ObjectRef>(child).as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
args_num += CountAdditionalArgs_(ttype);
}
} else if (auto tuple_node = GetRef<ObjectRef>(child).as<TupleNode>()) {
for (const auto& it : tuple_node->fields) {
args_num++;
args_num += CountArgs_(it.get(), till_node);
}
}
// argsMap_[child] = args_num;
return args_num;
}
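The recursion is easier to see on a toy tree: call and tuple nodes recurse into their operands (each tuple field also counting itself), vars and tuple projections contribute one argument each, and counting stops at till_node. A simplified mirror that ignores the dynamic-shape extras:

```python
# Toy mirror of CountArgs_ (simplified: no dynamic-shape extras, not TVM API).
# A node is a (kind, children) pair; counting stops when till_node is reached.
def count_args(node, till_node=None):
    if node is till_node:
        return 1
    kind, children = node
    if kind == "call":
        return sum(count_args(c, till_node) for c in children)
    if kind == "tuple":
        # Each tuple field counts itself plus whatever its subtree needs.
        return sum(1 + count_args(c, till_node) for c in children)
    return 1  # "var" or "tuple_get_item": one argument

v = ("var", ())
call = ("call", (v, ("call", (v, v))))  # flattens to three var arguments
assert count_args(call) == 3
```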

size_t GraphPartitioner::CountArgsLimit_(const IndexedForwardGraph::Node* child) {
auto* outputs_list = child->outputs.head;
size_t output_args = 0;
while (outputs_list != nullptr) {
output_args++;
if (auto call_node = GetRef<ObjectRef>(outputs_list->value.node->ref).as<CallNode>()) {
if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
output_args += CountAdditionalArgs_(ttype, false);
}
}
outputs_list = outputs_list->next;
}
return (max_function_args_ > output_args) ? max_function_args_ - output_args : 0;
}
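In other words, the limit handed back to the caller is the target budget minus what the outputs already consume; a small mirror of that arithmetic:

```python
# Mirror of CountArgsLimit_: the budget left for input arguments is whatever
# remains of the target limit once output buffers are accounted for.
def count_args_limit(max_function_args, output_args):
    if max_function_args > output_args:
        return max_function_args - output_args
    return 0

assert count_args_limit(128, 3) == 125
assert count_args_limit(0, 3) == 0  # a limit of 0 disables the check
```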

size_t GraphPartitioner::CountFusedArgs(IndexedForwardGraph::Node* child,
IndexedForwardGraph::Node* till_node) {
const tvm::Object* till_node_ref = (till_node != nullptr) ? till_node->ref : nullptr;
auto* outputs_list = child->outputs.head;
size_t res = 1;
while (outputs_list != nullptr) {
size_t output_args = 0;
output_args += CountArgs_(outputs_list->value.node->ref, till_node_ref);
res = std::max(res, output_args);
// argsMap_.erase(outputs_list->value.node->ref);
outputs_list = outputs_list->next;
}
return res;
}

void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
groups_.resize(graph.post_dfs_order.size());
for (size_t nid = 0; nid < groups_.size(); ++nid) {
@@ -238,6 +320,7 @@ void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph, //
const DominatorTree& post_dom_tree, //
int phase) {
IndexedForwardGraph::Node* prev_node = nullptr;
for (size_t nid = 0; nid < groups_.size(); ++nid) {
// the group of current node has been specified already.
auto* graph_node = graph.post_dfs_order[nid];
@@ -254,6 +337,14 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph, //
// refuse the fusion if too many ops are going to be fused together
if (CountFusedNodesWithNewChild(graph_node, dom_node->parent->gnode) > max_fuse_depth_)
continue;
// refuse the fusion if too many arguments are going to be in the fused function
auto limit = CountArgsLimit_(graph_node);
if (limit > 0) {
if (CountFusedArgs(graph_node, prev_node) > limit) {
prev_node = graph_node;
continue;
}
}

if (phase == 2) {
// Fuse injective ops into intermediate tuples, if any
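Putting the counts together, the gate added above refuses a fusion whenever the candidate would exceed the remaining budget; a self-contained mirror of that decision (illustrative, not TVM API):

```python
def should_refuse_fusion(max_function_args, output_args, fused_args):
    # Budget left after output buffers; a limit of 0 disables the check.
    limit = max_function_args - output_args if max_function_args > output_args else 0
    return limit > 0 and fused_args > limit

assert should_refuse_fusion(128, 8, 150) is True    # over budget: keep separate
assert should_refuse_fusion(0, 8, 10_000) is False  # no target limit set
```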
22 changes: 19 additions & 3 deletions src/relay/analysis/graph_partitioner.h
@@ -78,7 +78,7 @@ class IndexedForwardGraph {
std::vector<Node*> post_dfs_order;

/*! \brief Dump the graph into string. */
void DebugDump() {
void DebugDump() const {
std::ostringstream os;
for (size_t i = 0; i < post_dfs_order.size(); ++i) {
Node* node = post_dfs_order[i];
@@ -162,8 +162,12 @@
*/
class GraphPartitioner {
public:
explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth)
: arena_(arena), opt_level_(opt_level), max_fuse_depth_(max_fuse_depth) {}
explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth,
size_t max_function_args)
: arena_(arena),
opt_level_(opt_level),
max_fuse_depth_(max_fuse_depth),
max_function_args_(max_function_args) {}
/*!
* \brief Group as a union find data structure.
*/
@@ -205,10 +209,14 @@
int opt_level_;
/*! \brief The maximum number of operations in one fused function */
size_t max_fuse_depth_;
/*! \brief The maximum number of arguments in one fused function */
size_t max_function_args_;
/*! \brief The internal groups. */
std::vector<Group*> groups_;
/*! \brief internal field used for deduplication */
std::unordered_set<IndexedForwardGraph::Node*> visited_;
/*! \brief internal field used for hashing arguments number for a node */
std::unordered_map<const tvm::Object*, size_t> argsMap_;
// Internal implementation of CheckPath
template <typename F>
bool CheckPath_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink, F fcond);
@@ -247,6 +255,9 @@
void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);

size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true);
size_t CountArgs_(const tvm::Object* child, const tvm::Object* till_node);
size_t CountArgsLimit_(const IndexedForwardGraph::Node* child);

// Count the number of nodes in a fused subgraph if child is additionally fused.
// dom_parent is already known to be a part of the subgraph.
@@ -256,6 +267,11 @@
// is important for correct calculation.
size_t CountFusedNodesWithNewChild(IndexedForwardGraph::Node* child,
IndexedForwardGraph::Node* dom_parent);
// Count the number of arguments in a fused subgraph if child is additionally fused.
// Calculation goes from the child node up to till_node. If till_node is
// nullptr, then arguments are counted up to the beginning of the graph.
size_t CountFusedArgs(IndexedForwardGraph::Node* child,
IndexedForwardGraph::Node* till_node = nullptr);

// Initialize the groups.
void InitGroups(const IndexedForwardGraph& graph);
2 changes: 1 addition & 1 deletion src/relay/backend/build_module.cc
@@ -337,7 +337,7 @@ class RelayBuildModule : public runtime::ModuleNode {
if (config_->optional_homogeneous_target.defined()) {
// This pass currently only supports the homogeneous case.
pass_seqs.push_back(transform::SplitArgs(
config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", -1)
config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
.value()
.IntValue()));
}
7 changes: 7 additions & 0 deletions src/relay/backend/vm/compiler.cc
@@ -1059,6 +1059,13 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
// Always plan devices so the remaining passes don't need to distinguish homogeneous vs
// heterogeneous execution.
pass_seqs.push_back(transform::PlanDevices(config_));
if (config_->optional_homogeneous_target.defined()) {
// This pass currently only supports the homogeneous case.
pass_seqs.push_back(transform::SplitArgs(
config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
.value()
.IntValue()));
}

pass_seqs.push_back(transform::FuseOps());

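With this hunk the VM pipeline applies the same cap as the graph-executor path; a minimal sketch through the generic executor API (model and target are placeholders):

```python
import numpy as np
import tvm
from tvm import relay

x = relay.var("x", shape=(1, 8))
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

# create_executor("vm", ...) drives VMCompiler, which now also runs
# SplitArgs with the target's max_function_args (0, i.e. no limit, on llvm).
run = relay.create_executor("vm", mod=mod, target="llvm").evaluate()
print(run(np.random.rand(1, 8).astype("float32")))
```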
23 changes: 16 additions & 7 deletions src/relay/transforms/fuse_ops.cc
@@ -319,9 +319,10 @@ class IndexedForwardGraphCreator : private ExprVisitor {

class FuseMutator : private MixedModeMutator {
public:
FuseMutator(int fuse_opt_level, size_t max_fuse_depth, bool link_params)
FuseMutator(int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args, bool link_params)
: fuse_opt_level_(fuse_opt_level),
max_fuse_depth_(max_fuse_depth),
max_function_args_(max_function_args),
link_params_(link_params) {}

// Run the transform
@@ -334,7 +335,8 @@ Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth, bool link_params) {
Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth, bool link_params) {
// setup the group map.
auto graph = IndexedForwardGraphCreator::Create(&arena_, body);
auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth).Partition(graph);
auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth, max_function_args_)
.Partition(graph);
for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) {
ICHECK(graph.post_dfs_order[nid]->ref != nullptr);
gmap_[graph.post_dfs_order[nid]->ref] = groups[nid];
@@ -347,6 +349,7 @@
private:
int fuse_opt_level_;
size_t max_fuse_depth_;
size_t max_function_args_;
bool link_params_;

using MixedModeMutator::VisitExpr_;
@@ -548,9 +551,10 @@
}
};

Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, bool link_params,
const IRModule& module) {
return FuseMutator(fuse_opt_level, max_fuse_depth, link_params).Transform(expr);
Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args,
bool link_params, const IRModule& module) {
return FuseMutator(fuse_opt_level, max_fuse_depth, max_function_args, link_params)
.Transform(expr);
}

namespace transform {
Expand All @@ -567,8 +571,13 @@ Pass FuseOps(int fuse_opt_level) {
link_params = pc->GetConfig("relay.FuseOps.link_params", Bool(link_params)).value();
int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level;
auto max_fuse_depth = pc->GetConfig("relay.FuseOps.max_depth", Integer(kMaxFusedOps));
return Downcast<Function>(
FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), link_params, m));
auto target = Target::Current();
size_t max_function_args =
(target.defined())
? target->GetAttr<Integer>("max_function_args", Integer(0)).value().IntValue()
: 0;
return Downcast<Function>(FuseOps(f, opt_level, max_fuse_depth.value().IntValue(),
max_function_args, link_params, m));
};
return CreateFunctionPass(pass_func, 0, "FuseOps", {"InferType"});
}
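Since the limit is read from Target::Current(), the cap applies whenever FuseOps runs inside a target scope; a sketch from the Python side (assuming an OpenCL-enabled build):

```python
import tvm
from tvm import relay

x = relay.var("x", shape=(1, 16))
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x + x)))
mod = relay.transform.InferType()(mod)

# Entering the target scope sets Target::Current(), so FuseOps reads
# max_function_args from it (128 on OpenCL after this patch).
with tvm.target.Target("opencl"):
    mod = relay.transform.FuseOps(fuse_opt_level=2)(mod)
print(mod)
```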