From 371f4023648532f73f7b000e9c33b5b3cd600977 Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Thu, 28 Sep 2023 23:41:59 +0000
Subject: [PATCH 1/5] TensorRT EP: sync stream before enqueue when graph is partitioned

---
 .../providers/tensorrt/tensorrt_execution_provider.cc     | 8 +++++++-
 .../core/providers/tensorrt/tensorrt_execution_provider.h | 4 ++++
 2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 55204abc80187..e14855fb6df90 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,6 +1865,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   } else if (number_of_trt_nodes == number_of_ort_nodes) {
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";
   } else {
+    sync_stream_before_enqueue_ = true;
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs;
   }
 
@@ -2383,7 +2384,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name,
             &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
             &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
-            input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
+            input_shape_ranges_[context->node_name], sync_stream_before_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
             dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
             runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_,
             dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
@@ -2411,6 +2412,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       const std::unordered_map<std::string, size_t>& input_indexes = (trt_state->input_info)[0];
       const std::unordered_map<std::string, size_t>& output_indexes = (trt_state->output_info)[0];
       const std::unordered_map<std::string, size_t>& output_types = (trt_state->output_info)[1];
+      bool sync_stream_before_enqueue = trt_state->sync_stream_before_enqueue;
       auto fused_node_name = trt_state->fused_node_name;
       auto& shape_ranges = trt_state->input_shape_ranges;
       auto trt_builder = trt_state->builder->get();
@@ -3013,6 +3015,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         CaptureBegin();
       }
 
+      if (sync_stream_before_enqueue)
+        cudaStreamSynchronize(stream);
+      }
+
       // Run TRT inference
       if (!trt_context->enqueueV2(&buffers[0], stream, nullptr)) {
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed.");
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 64ab2db2aedc9..c393a9dcf4f7b 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -111,6 +111,7 @@ struct TensorrtFuncState {
   std::vector<std::unordered_map<std::string, size_t>> input_info;
   std::vector<std::unordered_map<std::string, size_t>> output_info;
   std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
+  bool sync_stream_before_enqueue = false;
   OrtMutex* tensorrt_mu_ptr = nullptr;
   bool fp16_enable = false;
   bool int8_enable = false;
@@ -262,6 +263,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   cudnnHandle_t external_cudnn_handle_ = nullptr;
   cublasHandle_t external_cublas_handle_ = nullptr;
 
+  // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3()
+  sync_stream_before_enqueue_ = false;
+
   CUDAGraph cuda_graph_;
   bool is_graph_captured_ = false;
   int regular_run_count_before_graph_capture_ = 0;

From 7e35a8690ccd3d2ef3c03e87ae9e8deb17f6d1c9 Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Mon, 2 Oct 2023 19:16:34 +0000
Subject: [PATCH 2/5] Fix missing brace and member type for sync_stream_before_enqueue_

---
 .../core/providers/tensorrt/tensorrt_execution_provider.cc      | 2 +-
 .../core/providers/tensorrt/tensorrt_execution_provider.h       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index e14855fb6df90..535f7bb29ea29 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3015,7 +3015,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         CaptureBegin();
       }
 
-      if (sync_stream_before_enqueue)
+      if (sync_stream_before_enqueue) {
         cudaStreamSynchronize(stream);
       }
 
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index c393a9dcf4f7b..a7e612a7dc6e8 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -264,7 +264,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   cublasHandle_t external_cublas_handle_ = nullptr;
 
   // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3()
-  sync_stream_before_enqueue_ = false;
+  mutable bool sync_stream_before_enqueue_ = false;
 
   CUDAGraph cuda_graph_;
   bool is_graph_captured_ = false;

From 3a34f89b91a74b793535d0fe1bbb7868868cda43 Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Tue, 3 Oct 2023 01:52:01 +0000
Subject: [PATCH 3/5] Move cudaStreamSynchronize() to after enqueueV2()

---
 .../providers/tensorrt/tensorrt_execution_provider.cc     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 535f7bb29ea29..17ad4fb97c7b7 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -3015,15 +3015,15 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         CaptureBegin();
       }
 
-      if (sync_stream_before_enqueue) {
-        cudaStreamSynchronize(stream);
-      }
-
       // Run TRT inference
       if (!trt_context->enqueueV2(&buffers[0], stream, nullptr)) {
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed.");
       }
 
+      if (sync_stream_before_enqueue) {
+        cudaStreamSynchronize(stream);
+      }
+
       // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64
       for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) {
         const std::string& output_name = output_binding_names[i];

From 54a6bbf560708216cbdef4681a6670a2bc29520f Mon Sep 17 00:00:00 2001
From: Chi Lo <Chi.Lo@microsoft.com>
Date: Thu, 19 Oct 2023 06:39:54 +0000
Subject: [PATCH 4/5] Rename sync_stream_before_enqueue to sync_stream_after_enqueue

---
 .../providers/tensorrt/tensorrt_execution_provider.cc     | 8 ++++----
 .../core/providers/tensorrt/tensorrt_execution_provider.h | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 17ad4fb97c7b7..2c1db247cb526 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -1865,7 +1865,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph,
   } else if (number_of_trt_nodes == number_of_ort_nodes) {
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider";
   } else {
-    sync_stream_before_enqueue_ = true;
+    sync_stream_after_enqueue_ = true;
     LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs;
   }
 
@@ -2384,7 +2384,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name,
             &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name],
             &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name],
-            input_shape_ranges_[context->node_name], sync_stream_before_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
+            input_shape_ranges_[context->node_name], sync_stream_after_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
             dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
             runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_,
             dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
@@ -2412,7 +2412,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
       const std::unordered_map<std::string, size_t>& input_indexes = (trt_state->input_info)[0];
       const std::unordered_map<std::string, size_t>& output_indexes = (trt_state->output_info)[0];
       const std::unordered_map<std::string, size_t>& output_types = (trt_state->output_info)[1];
-      bool sync_stream_before_enqueue = trt_state->sync_stream_before_enqueue;
+      bool sync_stream_after_enqueue = trt_state->sync_stream_after_enqueue;
       auto fused_node_name = trt_state->fused_node_name;
       auto& shape_ranges = trt_state->input_shape_ranges;
       auto trt_builder = trt_state->builder->get();
@@ -3020,7 +3020,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
         return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed.");
       }
 
-      if (sync_stream_before_enqueue) {
+      if (sync_stream_after_enqueue) {
         cudaStreamSynchronize(stream);
       }
 
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index a7e612a7dc6e8..b8b82f650c7e3 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -111,7 +111,7 @@ struct TensorrtFuncState {
   std::vector<std::unordered_map<std::string, size_t>> input_info;
   std::vector<std::unordered_map<std::string, size_t>> output_info;
   std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
-  bool sync_stream_before_enqueue = false;
+  bool sync_stream_after_enqueue = false;
   OrtMutex* tensorrt_mu_ptr = nullptr;
   bool fp16_enable = false;
   bool int8_enable = false;
@@ -264,7 +264,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   cublasHandle_t external_cublas_handle_ = nullptr;
 
   // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3()
-  mutable bool sync_stream_before_enqueue_ = false;
+  mutable bool sync_stream_after_enqueue_ = false;
 
   CUDAGraph cuda_graph_;
   bool is_graph_captured_ = false;

From 8168a432a146477f4e530ba88d8e07d7d216b664 Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Thu, 19 Oct 2023 15:07:16 -0700
Subject: [PATCH 5/5] Fix comment

---
 .../core/providers/tensorrt/tensorrt_execution_provider.h       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index b8b82f650c7e3..3bf6bc05a65df 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -263,7 +263,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   cudnnHandle_t external_cudnn_handle_ = nullptr;
   cublasHandle_t external_cublas_handle_ = nullptr;
 
-  // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3()
+  // Call cudaStreamSynchronize() after TRT enqueueV2()/enqueueV3()
   mutable bool sync_stream_after_enqueue_ = false;
 
   CUDAGraph cuda_graph_;