From 6a6640eea64967c39bb7c096f2725153efe2b4a9 Mon Sep 17 00:00:00 2001
From: Michalis Papapdimitriou
Date: Thu, 3 Mar 2022 01:42:10 -0800
Subject: [PATCH] Add TRT inference builder auto-convert precision flags as attrs in the config

---
 python/tvm/relay/op/contrib/tensorrt.py          |  9 +++++++++
 src/relay/backend/contrib/tensorrt/codegen.cc    | 11 ++++++++++-
 src/runtime/contrib/tensorrt/tensorrt_builder.cc |  1 -
 src/runtime/contrib/tensorrt/tensorrt_runtime.cc |  8 ++++++--
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py
index f45a06d25a4cf..137fb0d3c5401 100644
--- a/python/tvm/relay/op/contrib/tensorrt.py
+++ b/python/tvm/relay/op/contrib/tensorrt.py
@@ -87,6 +87,8 @@ def partition_for_tensorrt(
     use_implicit_batch=True,
     remove_no_mac_subgraphs=False,
     max_workspace_size=1 << 30,
+    use_fp16=False,
+    use_uint8=False,
 ):
     """Partition the graph greedily offloading supported operators to TensorRT.
@@ -110,6 +112,11 @@
     max_workspace_size : Optional[int]
         How many bytes of workspace size to allow each subgraph to use for TensorRT
         engine creation. See TensorRT documentation for more info.
+    use_fp16: Optional[bool]
+        Allows TRT to automatically convert FP32 inputs to FP16. It must also be enabled when FP16 input tensors and weights are used.
+        Note that TensorRT will still choose a higher-precision kernel if it results in overall lower runtime, or if no low-precision implementation exists.
+    use_uint8: Optional[bool]
+        Allows TRT to automatically convert FP32 inputs to UINT8.

     Returns
     -------
     mod_and_config : Tuple[Module, Dict[str, Any]]
@@ -120,6 +127,8 @@
         "use_implicit_batch": use_implicit_batch,
         "max_workspace_size": max_workspace_size,
         "remove_no_mac_subgraphs": remove_no_mac_subgraphs,
+        "use_fp16": use_fp16,
+        "use_uint8": use_uint8,
     }
     if version:
         assert isinstance(version, tuple) and len(version) == 3
diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc
index d83a9003229cc..ac8e392a62f50 100644
--- a/src/relay/backend/contrib/tensorrt/codegen.cc
+++ b/src/relay/backend/contrib/tensorrt/codegen.cc
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfigNode> {
+  bool use_fp16;
+  bool use_uint8;
@@ ... @@
     std::vector<std::string> tensorrt_version = {std::to_string(cfg.value()->tensorrt_version[0]),
                                                  std::to_string(cfg.value()->tensorrt_version[1]),
                                                  std::to_string(cfg.value()->tensorrt_version[2])};
     std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
     std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
-    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
+    std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
+    std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
+    std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
+        use_fp16_attr, use_uint8_attr;
     tensorrt_version_attr.emplace_back(tensorrt_version);
     use_implicit_batch_attr.emplace_back(use_implicit_batch);
     max_workspace_size_attr.emplace_back(max_workspace_size);
     node->SetAttr("tensorrt_version", tensorrt_version_attr);
     node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
     node->SetAttr("max_workspace_size", max_workspace_size_attr);
+    node->SetAttr("use_fp16", use_fp16_attr);
+    node->SetAttr("use_uint8", use_uint8_attr);
   }
 };
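
For reference, here is a minimal, hedged sketch of how the two new flags are meant to be consumed
on the Python side, assuming the usual partition-then-build flow and the
"relay.ext.tensorrt.options" PassContext key used by the TensorRT BYOC integration; the toy conv
model, the shapes, and the "cuda" target are illustrative only and not part of this patch:

    import tvm
    from tvm import relay
    from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

    # A tiny FP32 model, only to exercise the flow (illustrative, not from this patch).
    x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32")
    w = relay.var("w", shape=(16, 3, 3, 3), dtype="float32")
    out = relay.nn.relu(relay.nn.conv2d(x, w, kernel_size=(3, 3), padding=(1, 1)))
    mod = tvm.IRModule.from_expr(relay.Function([x, w], out))

    # The new keyword arguments added by this patch; the returned config dict now
    # carries "use_fp16"/"use_uint8" alongside the existing options.
    mod, config = partition_for_tensorrt(mod, params=None, use_fp16=True, use_uint8=False)

    # The config is forwarded to the TensorRT codegen through the PassContext, where the
    # serializer records the flags on the JSON subgraph node (the node->SetAttr calls above).
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        lib = relay.build(mod, target="cuda")

With use_fp16=True the builder is allowed to pick FP16 kernels, but, as the docstring notes,
TensorRT may still fall back to a higher-precision kernel when that is faster overall.
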
diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
index 1a310969cf134..af813dbc40c4e 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -141,7 +141,6 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
           << " requires weights but got a tensor.";
       }
     }
-    VLOG(1) << "INT " << input.type;
     params.inputs.push_back(input);
   }
diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
index a5779f739dac8..3f4fa9da9820f 100644
--- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
+++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -72,7 +72,8 @@
         use_implicit_batch_(true),
         max_workspace_size_(size_t(1) << 30),
         max_batch_size_(-1),
-        multi_engine_mode_(false) {
+        multi_engine_mode_(false),
+        use_fp16_(false) {
     const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
     multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
     num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@
   }

   void BuildEngineFromJson(int batch_size) {
-    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
+    const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
     TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
                             use_fp16, batch_size, calibrator_.get());
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -492,6 +493,9 @@
   * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
   * and more time spent building engines. */
  bool multi_engine_mode_;
+
+  /*! \brief Use auto-conversion to fp16 */
+  bool use_fp16_;
 };

 runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
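
One behavioral note worth calling out: BuildEngineFromJson now ORs the existing
TVM_TENSORRT_USE_FP16 environment variable with the compiled-in use_fp16_ attribute, so FP16
conversion can still be forced at run time for a module that was built with use_fp16=False.
A small, hedged sketch of that path, assuming a library exported from the flow shown earlier;
the "trt_module.so" path and the input name "x" are placeholders, not part of this patch:

    import os
    import numpy as np
    import tvm
    from tvm.contrib import graph_executor

    # Set before the first inference: the TensorRT engine is built when
    # BuildEngineFromJson runs (typically at the first run), and the env var is
    # OR-ed there with the "use_fp16" attribute stored at compile time.
    os.environ["TVM_TENSORRT_USE_FP16"] = "1"

    lib = tvm.runtime.load_module("trt_module.so")  # placeholder path
    dev = tvm.cuda(0)
    module = graph_executor.GraphModule(lib["default"](dev))

    module.set_input("x", np.random.rand(1, 3, 224, 224).astype("float32"))
    module.run()

The same environment-variable mechanism already exists for TVM_TENSORRT_USE_INT8 and
TVM_TENSORRT_MULTI_ENGINE in the constructor above; this patch only adds the attribute-based
path for FP16 so the choice can also be made once at compile time.
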