[1.x][FEATURE] CUDA graphs support (apache#19142)
* Initial cherry-pick

* Store NodeAttrs in OpExecutor

* Do not allow stateful operations in CUDA graphs and provide a mechanism
  for marking ops as safe

* Guard against using ops with synchronization

* Cleaning

* Properly guard graphs

* Limit graphs to CUDA 10.2+

* Fix the compilation when graphs are not available

* Guarding the libcuda.so usage behind the RTC compilation flag

* Document the env variables

* Add test

* Fix the test

* Use with_environment
ptrendx authored and DickJC123 committed Jun 1, 2021
1 parent 57d0ace commit f4bcd48
Showing 26 changed files with 860 additions and 35 deletions.
10 changes: 10 additions & 0 deletions docs/static_site/src/pages/api/faq/env_var.md
@@ -170,6 +170,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
* MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
  - Values: Int ```(default=<value of MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN>)```
  - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.
+* MXNET_ENABLE_CUDA_GRAPHS
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, MXNet utilizes CUDA graphs when executing models on the GPU, whenever possible.
+  - CUDA graphs execution requires either a symbolic model or a Gluon model hybridized with the options `static_alloc` and `static_shape` set to `True`.
+* MXNET_CUDA_GRAPHS_VERBOSE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, the CUDA graphs executor logs information about each graph being captured and executed.
+* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES
+  - Values: Int ```(default=0)```
+  - The maximum number of log messages generated by the CUDA graphs executor.

## Control the Data Communication

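How the executor consumes these variables is not shown in this excerpt. Below is a minimal sketch of the usual MXNet/dmlc-core pattern for reading such flags; the helper names are assumptions for illustration, not the commit's actual code:

```cpp
#include <dmlc/parameter.h>  // dmlc::GetEnv

// Minimal sketch, assuming the flag is read once and cached. Users set the
// variable in the shell (e.g. `export MXNET_ENABLE_CUDA_GRAPHS=1`) before
// launching the process.
bool CudaGraphsEnabled() {
  static const bool enabled = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false);
  return enabled;
}

int MaxGraphLogEntries() {
  static const int entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0);
  return entries;
}
```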
13 changes: 13 additions & 0 deletions include/mxnet/op_attr_types.h
@@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function<std::vector<int> (const NodeAttrs& att
*/
using FNeedCalibrateOutput = std::function<std::vector<int> (const NodeAttrs& attrs)>;

+#if MXNET_USE_CUDA
+
+/*!
+ * \brief Register a function to determine if the operator
+ *        implementation is compatible with CUDA graphs.
+ *        This requires the execution to stay the same as
+ *        long as the shapes and types of the inputs stay
+ *        the same.
+ */
+using FIsCUDAGraphsCompatible = std::function<bool (const NodeAttrs& attrs, const bool is_train)>;
+
+#endif
+
} // namespace mxnet

#endif // MXNET_OP_ATTR_TYPES_H_
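The new attribute follows the usual NNVM registration pattern. A hedged sketch of how a GPU operator might opt in — the operator name here is hypothetical, and the attribute string mirroring the type name is the customary MXNet convention, assumed rather than shown in this excerpt:

```cpp
#include <mxnet/op_attr_types.h>  // FIsCUDAGraphsCompatible (added above)
#include <nnvm/op.h>

// Illustrative only: `_hypothetical_elemwise_op` does not exist in MXNet.
// Ops whose kernels depend on mutable state, host-side synchronization, or
// train-time randomness should return false (or `!is_train`).
NNVM_REGISTER_OP(_hypothetical_elemwise_op)
.set_attr<mxnet::FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible",
    [](const nnvm::NodeAttrs& attrs, const bool is_train) {
      // Deterministic launch pattern for fixed shapes/types => capturable.
      return true;
    });
```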
73 changes: 40 additions & 33 deletions src/imperative/attach_op_execs_pass.cc
@@ -49,8 +49,10 @@ namespace exec {
// FComputeExecutor and FStatefulComputeExecutor inherit from this class
class StorageFallbackOpExecutor : public OpExecutor {
public:
-  explicit StorageFallbackOpExecutor(std::vector<uint32_t> mutate_idx)
-      : mutate_idx_(std::move(mutate_idx)) {}
+  explicit StorageFallbackOpExecutor(const NodeAttrs& attrs,
+                                     DispatchMode dispatch_mode,
+                                     std::vector<uint32_t> mutate_idx)
+      : OpExecutor(attrs, dispatch_mode), mutate_idx_(std::move(mutate_idx)) {}

void Setup() override {
init_ = false;
@@ -137,11 +139,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
return state_;
}

-  explicit StatefulComputeExecutor(OpStatePtr state,
-                                   FStatefulCompute fcompute,
-                                   ExecType exec_type,
-                                   const std::vector<uint32_t> &mutate_idx)
-      : StorageFallbackOpExecutor(mutate_idx),
+  explicit StatefulComputeExecutor(const NodeAttrs& attrs,
+                                   DispatchMode dispatch_mode,
+                                   OpStatePtr state,
+                                   FStatefulCompute fcompute,
+                                   ExecType exec_type,
+                                   const std::vector<uint32_t> &mutate_idx)
+      : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx),
state_(std::move(state)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {}

private:
@@ -158,7 +162,7 @@ class StatefulComputeExExecutor : public OpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
std::vector<NDArray> *pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
+    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs);
fcompute_(state_, op_ctx, *pInArray, req, out_array);
}

@@ -176,15 +180,15 @@
return state_;
}

-  explicit StatefulComputeExExecutor(NodeAttrs attrs,
-                                     OpStatePtr state,
-                                     FStatefulComputeEx fcompute,
+  explicit StatefulComputeExExecutor(const NodeAttrs& attrs,
+                                     DispatchMode dispatch_mode,
+                                     OpStatePtr state,
+                                     FStatefulComputeEx fcompute,
                                      ExecType exec_type)
-      : attrs_(std::move(attrs)), state_(std::move(state)), fcompute_(std::move(fcompute)),
+      : OpExecutor(attrs, dispatch_mode), state_(std::move(state)), fcompute_(std::move(fcompute)),
exec_type_(exec_type) {}

private:
-  NodeAttrs attrs_;
OpStatePtr state_;
FStatefulComputeEx fcompute_;
ExecType exec_type_;
@@ -199,22 +203,22 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
PreFCompute(is_gpu);
-    fcompute_(attrs_, op_ctx, in_data_, req, out_data_);
+    fcompute_(attrs, op_ctx, in_data_, req, out_data_);
PostFCompute(is_gpu);
}

ExecType exec_type() const override {
return exec_type_;
}

-  explicit FComputeExecutor(NodeAttrs attrs, FCompute fcompute,
-                            ExecType exec_type, const std::vector<uint32_t> &mutate_idx)
-      : StorageFallbackOpExecutor(mutate_idx),
-        attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
+  explicit FComputeExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode,
+                            FCompute fcompute, ExecType exec_type,
+                            const std::vector<uint32_t> &mutate_idx)
+      : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx),
+        fcompute_(std::move(fcompute)), exec_type_(exec_type) {
}

private:
-  NodeAttrs attrs_;
FCompute fcompute_;
ExecType exec_type_;
};
@@ -226,8 +230,8 @@ class FComputeExExecutor : public OpExecutor {
op_ctx.run_ctx = rctx;
INVALIDATE_OUTPUTS(out_array, req);
std::vector<NDArray> *pInArray = &in_array;
-    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs_);
-    fcompute_(attrs_, op_ctx, *pInArray, req, out_array);
+    CREATE_DEFAULT_INPUTS_MKLDNN(in_array, pInArray = &in_array_fallback, attrs);
+    fcompute_(attrs, op_ctx, *pInArray, req, out_array);
}

void Setup() override {}
@@ -236,13 +240,12 @@
return exec_type_;
}

-  explicit FComputeExExecutor(NodeAttrs attrs, FComputeEx fcompute,
-                              ExecType exec_type)
-      : attrs_(std::move(attrs)), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
+  explicit FComputeExExecutor(const NodeAttrs& attrs, DispatchMode dispatch_mode,
+                              FComputeEx fcompute, ExecType exec_type)
+      : OpExecutor(attrs, dispatch_mode), fcompute_(std::move(fcompute)), exec_type_(exec_type) {
}

private:
-  NodeAttrs attrs_;
FComputeEx fcompute_;
ExecType exec_type_;
};
@@ -296,15 +299,18 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state,
op, "FStatefulComputeEx", vctx[i]);
// FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
-        ret[i] = std::make_shared<StatefulComputeExExecutor>(inode.source->attrs, state,
+        ret[i] = std::make_shared<StatefulComputeExExecutor>(inode.source->attrs,
+                                                             dispatch_modes[i], state,
                                                              fcompute_ex, exec_type);
} else {
FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
op, "FStatefulCompute", vctx[i]);
CHECK(fcompute != nullptr)
<< "One of FStatefulCompute and FStatefulComputeEx must be registered "
<< "for stateful operator " << op->name;
-        ret[i] = std::make_shared<StatefulComputeExecutor>(state, fcompute,
+        ret[i] = std::make_shared<StatefulComputeExecutor>(inode.source->attrs,
+                                                           dispatch_modes[i],
+                                                           state, fcompute,
                                                            exec_type, mutate_index);
}
} else if (is_layer_backward.get(op, false)) {
@@ -317,26 +323,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state,
// FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx
if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
ret[i] = std::make_shared<StatefulComputeExExecutor>(
-            inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex,
-            exec_type);
+            inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(),
+            fcompute_ex, exec_type);
} else {
FStatefulCompute fcompute = common::GetFCompute<FStatefulCompute>(
op, "FStatefulCompute", vctx[i]);
CHECK(fcompute != nullptr)
<< "One of FStatefulCompute and FStatefulComputeEx must be registered "
<< "for stateful operator " << op->name;
-        ret[i] = std::make_shared<StatefulComputeExecutor>(
-            ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index);
+        ret[i] = std::make_shared<StatefulComputeExecutor>(inode.source->attrs,
+            dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type,
+            mutate_index);
}
} else {
FCompute fcompute = common::GetFCompute<FCompute>(op, "FCompute", vctx[i]);
FComputeEx fcomp_ex = common::GetFCompute<FComputeEx>(op, "FComputeEx", vctx[i]);
if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) {
ret[i] = std::make_shared<FComputeExExecutor>(
-            inode.source->attrs, fcomp_ex, exec_type);
+            inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type);
} else if (fcompute != nullptr) {
ret[i] = std::make_shared<FComputeExecutor>(
-            inode.source->attrs, fcompute, exec_type, mutate_index);
+            inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index);
} else {
LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name;
}
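Every call site above now hands `attrs` and `dispatch_mode` to the `OpExecutor` base class, whose updated constructor lives outside this excerpt (the commit touches 26 files, only three of which are shown). A minimal sketch of the assumed base-class shape, to make the diff readable on its own:

```cpp
#include <utility>                // std::move
#include <mxnet/op_attr_types.h>  // mxnet::DispatchMode, nnvm::NodeAttrs

// Sketch only: the real OpExecutor also carries input/output arrays, an
// OpContext, and virtual Run()/Setup() hooks.
class OpExecutor {
 public:
  OpExecutor(nnvm::NodeAttrs attrs, mxnet::DispatchMode dispatch_mode)
      : attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {}
  virtual ~OpExecutor() = default;
  // Kept on the base so subclasses can drop their private attrs_ copies and
  // so CUDA-graphs code can inspect any node's attributes uniformly, e.g. to
  // query FIsCUDAGraphsCompatible before capturing a subgraph.
  nnvm::NodeAttrs attrs;
  mxnet::DispatchMode dispatch_mode;
};
```

Storing the attributes once on the base is what lets each executor subclass in this file shed its private `attrs_` member, which is the bulk of the deletions above.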
