Commit
Add TRT inference builder auto-convert precision flags as attrs in the config
Michalis Papadimitriou committed Mar 3, 2022
1 parent d357c32 commit 6a6640e
Showing 4 changed files with 25 additions and 4 deletions.
9 changes: 9 additions & 0 deletions python/tvm/relay/op/contrib/tensorrt.py
@@ -87,6 +87,8 @@ def partition_for_tensorrt(
use_implicit_batch=True,
remove_no_mac_subgraphs=False,
max_workspace_size=1 << 30,
use_fp16=False,
use_uint8=False,
):
"""Partition the graph greedily offloading supported operators to TensorRT.
@@ -110,6 +112,11 @@
max_workspace_size : Optional[int]
How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation.
See TensorRT documentation for more info.
use_fp16 : Optional[bool]
Allows TRT to automatically convert FP32 inputs to FP16. Must also be enabled if FP16 input tensors and weights are used.
Note that TensorRT will still choose a higher-precision kernel if it results in overall lower runtime, or if no low-precision implementation exists.
use_uint8 : Optional[bool]
Allows TRT to automatically convert FP32 inputs to UINT8.
Returns
-------
mod_and_config : Tuple[Module, Dict[str, Any]]
@@ -120,6 +127,8 @@
"use_implicit_batch": use_implicit_batch,
"max_workspace_size": max_workspace_size,
"remove_no_mac_subgraphs": remove_no_mac_subgraphs,
"use_fp16": use_fp16,
"use_uint8": use_uint8,
}
if version:
assert isinstance(version, tuple) and len(version) == 3
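For context, here is a minimal usage sketch of the new flags; the toy network, shapes, and empty params below are illustrative assumptions, not part of the commit:

import tvm
from tvm import relay
from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

# Toy single-conv network standing in for a real FP32 model.
data = relay.var("data", shape=(1, 3, 224, 224), dtype="float32")
weight = relay.var("weight", shape=(16, 3, 3, 3), dtype="float32")
out = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))

# The new flag is forwarded into the config dict built above; per the docstring,
# TensorRT may still pick higher-precision kernels where they are faster.
mod, config = partition_for_tensorrt(mod, params={}, use_fp16=True)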
11 changes: 10 additions & 1 deletion src/relay/backend/contrib/tensorrt/codegen.cc
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
bool use_implicit_batch;
size_t max_workspace_size;
bool remove_no_mac_subgraphs;
bool use_fp16;
bool use_uint8;

TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
TVM_ATTR_FIELD(use_fp16).set_default(false);
TVM_ATTR_FIELD(use_uint8).set_default(false);
}
};

@@ -215,13 +219,18 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
std::to_string(cfg.value()->tensorrt_version[2])};
std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
use_fp16_attr, use_uint8_attr;
tensorrt_version_attr.emplace_back(tensorrt_version);
use_implicit_batch_attr.emplace_back(use_implicit_batch);
max_workspace_size_attr.emplace_back(max_workspace_size);
use_fp16_attr.emplace_back(use_fp16);
use_uint8_attr.emplace_back(use_uint8);
node->SetAttr("tensorrt_version", tensorrt_version_attr);
node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
node->SetAttr("max_workspace_size", max_workspace_size_attr);
node->SetAttr("use_fp16", use_fp16_attr);
node->SetAttr("use_uint8", use_uint8_attr);
}
};

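The attrs declared above are read from the compiler's pass config, so the dictionary returned by partition_for_tensorrt carries them into compilation. Continuing the sketch above, and assuming the "relay.ext.tensorrt.options" config key used by this integration:

# Sketch: apply the config (including use_fp16/use_uint8) during build so the
# serializer above can read it from the pass context and attach it to the JSON graph.
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
    lib = relay.build(mod, target="cuda", params={})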
1 change: 0 additions & 1 deletion src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -141,7 +141,6 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
<< " requires weights but got a tensor.";
}
}
VLOG(1) << "INT " << input.type;
params.inputs.push_back(input);
}

8 changes: 6 additions & 2 deletions src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
use_implicit_batch_(true),
max_workspace_size_(size_t(1) << 30),
max_batch_size_(-1),
multi_engine_mode_(false) {
multi_engine_mode_(false),
use_fp16_(false) {
const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@
}

void BuildEngineFromJson(int batch_size) {
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
use_fp16, batch_size, calibrator_.get());
for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -492,6 +493,9 @@
* encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
* and more time spent building engines. */
bool multi_engine_mode_;

/*! \brief Use auto-conversion to fp16 */
bool use_fp16_;
};

runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
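As the change to BuildEngineFromJson shows, FP16 can now be enabled either through the serialized use_fp16_ attribute or through the pre-existing environment variable, since the two are OR-ed together. A small sketch of the environment-variable path:

import os

# Equivalent switch: dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) reads this at
# engine-build time, so set it before the module executes for the first time.
os.environ["TVM_TENSORRT_USE_FP16"] = "1"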
