[1.x][FEATURE] CUDA graphs support (apache#19142)
* Initial cherry-pick

* Store NodeAttrs in OpExecutor

* Do not allow stateful operations in CUDA graphs and provide a mechanism
  for marking ops as safe

* Guard against using ops with synchronization

* Cleaning

* Properly guard graphs

* Limit graphs to CUDA 10.2+

* Fix the compilation when graphs are not available

* Guarding the libcuda.so usage behind the RTC compilation flag

* Document the env variables

* Add test

* Fix the test

* Use with_environment
ptrendx authored and DickJC123 committed Jun 1, 2021
1 parent 57d0ace commit f4bcd48
Showing 26 changed files with 860 additions and 35 deletions.
10 changes: 10 additions & 0 deletions docs/static_site/src/pages/api/faq/env_var.md
@@ -170,6 +170,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
  - Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
  - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.
+* MXNET_ENABLE_CUDA_GRAPHS
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, MXNet utilizes CUDA graphs when executing models on the GPU, whenever possible.
+  - CUDA graphs execution requires either a symbolic model or a Gluon model hybridized with the options `static_alloc` and `static_shape` set to `True`.
+* MXNET_CUDA_GRAPHS_VERBOSE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, the CUDA graphs executor logs information about each graph being captured and executed.
+* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES
+  - Values: Int ```(default=0)```
+  - The maximum number of log messages generated by the CUDA graphs executor.

## Control the Data Communication

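How the executor consumes these variables is not shown in this excerpt. Below is a minimal sketch of the usual MXNet/dmlc-core pattern for reading such flags; the helper names are assumptions for illustration, not the commit's actual code:

```cpp
#include <dmlc/parameter.h>  // dmlc::GetEnv

// Minimal sketch, assuming the flag is read once and cached. Users set the
// variable in the shell (e.g. `export MXNET_ENABLE_CUDA_GRAPHS=1`) before
// launching the process.
bool CudaGraphsEnabled() {
  static const bool enabled = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false);
  return enabled;
}

int MaxGraphLogEntries() {
  static const int entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0);
  return entries;
}
```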
13 changes: 13 additions & 0 deletions include/mxnet/op_attr_types.h
@@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function<std::vector<int> (const NodeAttrs& att
*/
using FNeedCalibrateOutput = std::function<std::vector<int> (const NodeAttrs& attrs)>;

+#if MXNET_USE_CUDA
+
+/*!
+ * \brief Register a function to determine if the operator
+ *        implementation is compatible with CUDA graphs.
+ *        This requires the execution to stay the same as
+ *        long as the shapes and types of the inputs stay
+ *        the same.
+ */
+using FIsCUDAGraphsCompatible = std::function<bool (const NodeAttrs& attrs, const bool is_train)>;
+
+#endif
+
} // namespace mxnet

#endif // MXNET_OP_ATTR_TYPES_H_
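The new attribute follows the usual NNVM registration pattern. A hedged sketch of how a GPU operator might opt in — the operator name here is hypothetical, and the attribute string mirroring the type name is the customary MXNet convention, assumed rather than shown in this excerpt:

```cpp
#include <mxnet/op_attr_types.h>  // FIsCUDAGraphsCompatible (added above)
#include <nnvm/op.h>

// Illustrative only: `_hypothetical_elemwise_op` does not exist in MXNet.
// Ops whose kernels depend on mutable state, host-side synchronization, or
// train-time randomness should return false (or `!is_train`).
NNVM_REGISTER_OP(_hypothetical_elemwise_op)
.set_attr<mxnet::FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible",
    [](const nnvm::NodeAttrs& attrs, const bool is_train) {
      // Deterministic launch pattern for fixed shapes/types => capturable.
      return true;
    });
```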
73 changes: 40 additions & 33 deletions src/imperative/attach_op_execs_pass.cc
@@ -49,8 +49,10 @@ namespace exec {
// FComputeExecutor and FStatefulComputeExecutor inherit from this class
class StorageFallbackOpExecutor : public OpExecutor {
public:
-  explicit StorageFallbackOpExecutor(std::vector<uint32_t> mutate_idx)
-      : mutate_idx_(std::move(mutate_idx)) {}
+  explicit StorageFallbackOpExecutor(const NodeAttrs& attrs,
+                                     DispatchMode dispatch_mode,
+                                     std::vector<uint32_t> mutate_idx)
+      : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {}

void Setup() override {
init_ = false;
@@ -137,11 +139,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
return state_;
}

-  explicit StatefulComputeExecutor(OpStatePtr state,
-                                   FStatefulCompute fcompute,
-                                   ExecType exec_type,
-                                   const std::vector<uint32_t> &mutate_idx)
-      : StorageFallbackOpExecutor(mutate_idx),
+  explicit StatefulComputeExecutor(const NodeAttrs& attrs,
+                                   DispatchMode dispatch_mode,
+                                   OpStatePtr state,
+                                   FStatefulCompute fcompute,
+                                   ExecType exec_type,
+                                   const std::vector<uint32_t> &mutate_idx)
+      : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx),
state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {}

private:
@@ -158,7 +162,7 @@ class StatefulComputeExExecutor : public OpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
std::vector<NDArray> *pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
+    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs);
fcompute_(state_, op_ctx, *pInArray, req, out_array);
}

@@ -176,15 +180,15 @@
return state_;
}

-  explicit StatefulComputeExExecutor(NodeAttrs attrs,
-                                     OpStatePtr state,
-                                     FStatefulComputeEx fcompute,
+  explicit StatefulComputeExExecutor(const NodeAttrs& attrs,
+                                     DispatchMode dispatch_mode,
+                                     OpStatePtr state,
+                                     FStatefulComputeEx fcompute,
                                      ExecType exec_type)
-      : attrs_(std::move(attrs)), state_(std::move(state)), fcompute_(std::move(fcompute)),
+      : OpExecutor(attrs, dispatch_mode), state_(std::move(state)), fcompute_(std::move(fcompute)),
exec_type_(exec_type) {}

private:
-  NodeAttrs attrs_;
OpStatePtr state_;
FStatefulComputeEx fcompute_;
ExecType exec_type_;
@@ -199,22 +203,22 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
PreFCompute(is_gpu);
-    fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
+    fcompute_(attrs, op_ctx, in_data_, req, out_data_);
PostFCompute(is_gpu);
}

ExecType exec_type() const override {
return exec_type_;
}

-  explicit FComputeExecutor(NodeAttrs attrs, FCompute fcompute,
-                            ExecType exec_type, const std::vector<uint32_t> &mutate_idx)
-      : StorageFallbackOpExecutor(mutate_idx),
-        attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
+  explicit FComputeExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode,
+                            FCompute fcompute, ExecType exec_type,
+                            const std::vector<uint32_t> &mutate_idx)
+      : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx),
+        fcompute_(std::move(fcompute)), exec_type_(exec_type) {
}

private:
-  NodeAttrs attrs_;
FCompute fcompute_;
ExecType exec_type_;
};
@@ -226,8 +230,8 @@ class FComputeExExecutor : public OpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
std::vector<NDArray> *pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
-    fcompute_(attrs_, op_ctx, *pInArray, req, out_array);
+    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs);
+    fcompute_(attrs, op_ctx, *pInArray, req, out_array);
}

void Setup() override {}
@@ -236,13 +240,12 @@
return exec_type_;
}

-  explicit FComputeExExecutor(NodeAttrs attrs, FComputeEx fcompute,
-                              ExecType exec_type)
-      : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
+  explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode,
+                              FComputeEx fcompute, ExecType exec_type)
+      : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
}

private:
-  NodeAttrs attrs_;
FComputeEx fcompute_;
ExecType exec_type_;
};
@@ -296,15 +299,18 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state,
op, "FStatefulComputeEx", vctx[i]);
// FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-        ret[i] = std::make_shared<StatefulComputeExExecutor>(inode.source->attrs, state,
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(inode.source->attrs,
+                                                             dispatch_modes[i], state,
                                                              fcompute_ex, exec_type);
} else {
FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
op, "FStatefulCompute", vctx[i]);
CHECK(fcompute != nullptr)
<< "One of FStatefulCompute and FStatefulComputeEx must be registered "
<< "for stateful operator " << op->name;
-        ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
+        ret[i] = std::make_shared<StatefulComputeExecutor>(inode.source->attrs,
+                                                           dispatch_modes[i],
+                                                           state, fcompute,
                                                            exec_type, mutate_index);
}
} else if (is_layer_backward.get(op, false)) {
@@ -317,26 +323,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state,
// FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
ret[i] = std::make_shared<StatefulComputeExExecutor>(
-            inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex,
-            exec_type);
+            inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(),
+            fcompute_ex, exec_type);
} else {
FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
op, "FStatefulCompute", vctx[i]);
CHECK(fcompute != nullptr)
<< "One of FStatefulCompute and FStatefulComputeEx must be registered "
<< "for stateful operator " << op->name;
-        ret[i] = std::make_shared<StatefulComputeExecutor>(
-            ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index);
+        ret[i] = std::make_shared<StatefulComputeExecutor>(inode.source->attrs,
+            dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type,
+            mutate_index);
}
} else {
FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
ret[i] = std::make_shared<FComputeExExecutor>(
-            inode.source->attrs, fcomp_ex, exec_type);
+            inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type);
} else if (fcompute != nullptr) {
ret[i] = std::make_shared<FComputeExecutor>(
-            inode.source->attrs, fcompute, exec_type, mutate_index);
+            inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index);
} else {
LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
}
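Every call site above now hands `attrs` and `dispatch_mode` to the `OpExecutor` base class, whose updated constructor lives outside this excerpt (the commit touches 26 files, only three of which are shown). A minimal sketch of the assumed base-class shape, to make the diff readable on its own:

```cpp
#include <utility>                // std::move
#include <mxnet/op_attr_types.h>  // mxnet::DispatchMode, nnvm::NodeAttrs

// Sketch only: the real OpExecutor also carries input/output arrays, an
// OpContext, and virtual Run()/Setup() hooks.
class OpExecutor {
 public:
  OpExecutor(nnvm::NodeAttrs attrs, mxnet::DispatchMode dispatch_mode)
      : attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {}
  virtual ~OpExecutor() = default;
  // Kept on the base so subclasses can drop their private attrs_ copies and
  // so CUDA-graphs code can inspect any node's attributes uniformly, e.g. to
  // query FIsCUDAGraphsCompatible before capturing a subgraph.
  nnvm::NodeAttrs attrs;
  mxnet::DispatchMode dispatch_mode;
};
```

Storing the attributes once on the base is what lets each executor subclass in this file shed its private `attrs_` member, which is the bulk of the deletions above.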
