From 371f4023648532f73f7b000e9c33b5b3cd600977 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Thu, 28 Sep 2023 23:41:59 +0000 Subject: [PATCH 1/5] update --- .../providers/tensorrt/tensorrt_execution_provider.cc | 8 +++++++- .../core/providers/tensorrt/tensorrt_execution_provider.h | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 55204abc80187..e14855fb6df90 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1865,6 +1865,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } else if (number_of_trt_nodes == number_of_ort_nodes) { LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider"; } else { + sync_stream_before_enqueue_ = true; LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs; } @@ -2383,7 +2384,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorallocate_func, context->release_func, context->allocator_handle, context->node_name, &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, + input_shape_ranges_[context->node_name], sync_stream_before_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, @@ -2411,6 +2412,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; + bool sync_stream_before_enqueue = trt_state->sync_stream_before_enqueue; auto fused_node_name = trt_state->fused_node_name; auto& shape_ranges = trt_state->input_shape_ranges; auto trt_builder = trt_state->builder->get(); @@ -3013,6 +3015,10 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorenqueueV2(&buffers[0], stream, nullptr)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed."); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 64ab2db2aedc9..c393a9dcf4f7b 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -111,6 +111,7 @@ struct TensorrtFuncState { std::vector> input_info; std::vector> output_info; std::unordered_map>>> input_shape_ranges; + bool sync_stream_before_enqueue = false; OrtMutex* tensorrt_mu_ptr = nullptr; bool fp16_enable = false; bool int8_enable = false; @@ -262,6 +263,9 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudnnHandle_t external_cudnn_handle_ = nullptr; cublasHandle_t external_cublas_handle_ = nullptr; + // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3() + sync_stream_before_enqueue_ = false; + CUDAGraph cuda_graph_; bool is_graph_captured_ = false; int regular_run_count_before_graph_capture_ = 0; From 7e35a8690ccd3d2ef3c03e87ae9e8deb17f6d1c9 Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Mon, 2 Oct 2023 19:16:34 +0000 Subject: [PATCH 2/5] update --- .../core/providers/tensorrt/tensorrt_execution_provider.cc | 2 +- .../core/providers/tensorrt/tensorrt_execution_provider.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index e14855fb6df90..535f7bb29ea29 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3015,7 +3015,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector Date: Tue, 3 Oct 2023 01:52:01 +0000 Subject: [PATCH 3/5] update --- .../providers/tensorrt/tensorrt_execution_provider.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 535f7bb29ea29..17ad4fb97c7b7 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3015,15 +3015,15 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorenqueueV2(&buffers[0], stream, nullptr)) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "TensorRT EP execution context enqueue failed."); } + if (sync_stream_before_enqueue) { + cudaStreamSynchronize(stream); + } + // Cast INT64 input to INT32 because TensorRT doesn't fully support INT64 for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { const std::string& output_name = output_binding_names[i]; From 54a6bbf560708216cbdef4681a6670a2bc29520f Mon Sep 17 00:00:00 2001 From: Chi Lo Date: Thu, 19 Oct 2023 06:39:54 +0000 Subject: [PATCH 4/5] update --- .../providers/tensorrt/tensorrt_execution_provider.cc | 8 ++++---- .../core/providers/tensorrt/tensorrt_execution_provider.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 17ad4fb97c7b7..2c1db247cb526 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1865,7 +1865,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } else if (number_of_trt_nodes == number_of_ort_nodes) { LOGS_DEFAULT(INFO) << "[TensorRT EP] Whole graph will run on TensorRT execution provider"; } else { - sync_stream_before_enqueue_ = true; + sync_stream_after_enqueue_ = true; LOGS_DEFAULT(INFO) << "[TensorRT EP] Graph is partitioned and number of subgraphs running on TensorRT execution provider is " << number_of_subgraphs; } @@ -2384,7 +2384,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorallocate_func, context->release_func, context->allocator_handle, context->node_name, &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], &builders_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], - input_shape_ranges_[context->node_name], sync_stream_before_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, + input_shape_ranges_[context->node_name], sync_stream_after_enqueue_, &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_, dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_, runtime_.get(), profiles_[context->node_name], context_memory_sharing_enable_, &max_ctx_mem_size_, dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_, @@ -2412,7 +2412,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector& input_indexes = (trt_state->input_info)[0]; const std::unordered_map& output_indexes = (trt_state->output_info)[0]; const std::unordered_map& output_types = (trt_state->output_info)[1]; - bool sync_stream_before_enqueue = trt_state->sync_stream_before_enqueue; + bool sync_stream_after_enqueue = trt_state->sync_stream_after_enqueue; auto fused_node_name = trt_state->fused_node_name; auto& shape_ranges = trt_state->input_shape_ranges; auto trt_builder = trt_state->builder->get(); @@ -3020,7 +3020,7 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector> input_info; std::vector> output_info; std::unordered_map>>> input_shape_ranges; - bool sync_stream_before_enqueue = false; + bool sync_stream_after_enqueue = false; OrtMutex* tensorrt_mu_ptr = nullptr; bool fp16_enable = false; bool int8_enable = false; @@ -264,7 +264,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { cublasHandle_t external_cublas_handle_ = nullptr; // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3() - mutable bool sync_stream_before_enqueue_ = false; + mutable bool sync_stream_after_enqueue_ = false; CUDAGraph cuda_graph_; bool is_graph_captured_ = false; From 8168a432a146477f4e530ba88d8e07d7d216b664 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Thu, 19 Oct 2023 15:07:16 -0700 Subject: [PATCH 5/5] Fix comment --- .../core/providers/tensorrt/tensorrt_execution_provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index b8b82f650c7e3..3bf6bc05a65df 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -263,7 +263,7 @@ class TensorrtExecutionProvider : public IExecutionProvider { cudnnHandle_t external_cudnn_handle_ = nullptr; cublasHandle_t external_cublas_handle_ = nullptr; - // Call cudaStreamSynchronize() before TRT enqueueV2()/enqueueV3() + // Call cudaStreamSynchronize() after TRT enqueueV2()/enqueueV3() mutable bool sync_stream_after_enqueue_ = false; CUDAGraph cuda_graph_;