[BYOC][TENSORRT] Add support for FP16 on TensorRT BYOC flow (#10388)
* FP16 support for TRT

* Cleanups on tests

* Fix for typing on output tensor

* Fix icheck

* Add TRT inference builder auto-convert precision flags as attrs in the config

* Address PR comments

* Fix bug on passing the new config attrs to codegen for tensorrt partition

Co-authored-by: Michalis Papapdimitriou <mpapapdimitriou@octoml.ai>
mikepapadim and Michalis Papapdimitriou authored Mar 11, 2022
1 parent 05cda49 commit 4e4f607
Showing 8 changed files with 416 additions and 296 deletions.
140 changes: 65 additions & 75 deletions python/tvm/relay/op/contrib/tensorrt.py

Large diffs are not rendered by default.

13 changes: 12 additions & 1 deletion src/relay/backend/contrib/tensorrt/codegen.cc
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
bool use_implicit_batch;
size_t max_workspace_size;
bool remove_no_mac_subgraphs;
bool use_fp16;
bool use_uint8;

TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
TVM_ATTR_FIELD(use_fp16).set_default(false);
TVM_ATTR_FIELD(use_uint8).set_default(false);
}
};
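
The two new options default to off. The serializer hunk below reads them from the config object; the call that actually fetches that config from the active PassContext sits just above the rendered lines. A minimal sketch of that read, assuming the usual "relay.ext.tensorrt.options" key and the AttrsWithDefaultValues fallback used by this backend (the helper name ReadUseFp16FromPassContext is hypothetical):

// Sketch only: fetch the TensorRT compiler config from the current PassContext,
// falling back to an all-defaults instance when nothing was set.
bool ReadUseFp16FromPassContext() {
  auto ctx = tvm::transform::PassContext::Current();
  auto cfg = ctx->GetConfig<TensorRTCompilerConfig>("relay.ext.tensorrt.options");
  if (!cfg.defined()) {
    cfg = AttrsWithDefaultValues<TensorRTCompilerConfig>();
  }
  return cfg.value()->use_fp16;  // false unless explicitly enabled
}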

@@ -215,13 +219,20 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
std::to_string(cfg.value()->tensorrt_version[2])};
std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
use_fp16_attr, use_uint8_attr;
tensorrt_version_attr.emplace_back(tensorrt_version);
use_implicit_batch_attr.emplace_back(use_implicit_batch);
max_workspace_size_attr.emplace_back(max_workspace_size);
use_fp16_attr.emplace_back(use_fp16);
use_uint8_attr.emplace_back(use_uint8);
node->SetAttr("tensorrt_version", tensorrt_version_attr);
node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
node->SetAttr("max_workspace_size", max_workspace_size_attr);
node->SetAttr("use_fp16", use_fp16_attr);
node->SetAttr("use_uint8", use_uint8_attr);
}
};
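
On the runtime side (not part of the rendered hunks) the flag can be recovered from a serialized op node, since SaveGlobalAttributes above stores it as a node attribute holding a vector of strings. A plausible sketch using the JSON runtime's node API (ReadUseFp16Attr is a hypothetical helper name):

// Sketch only: read the serialized flag back from a JSONGraphNode.
bool ReadUseFp16Attr(const tvm::runtime::json::JSONGraphNode& node) {
  if (!node.HasAttr("use_fp16")) return false;
  return std::stoi(node.GetAttr<std::vector<std::string>>("use_fp16")[0]) != 0;
}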

29 changes: 18 additions & 11 deletions src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -85,8 +85,13 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode&
shape.erase(shape.begin());
}
nvinfer1::Dims dims = VectorToTrtDims(shape);
ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported.";
auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims);
ICHECK((dtypes[i].bits == 16 || dtypes[i].bits == 32))
<< "Invalid input tensor type. Only float16 and float32 are supported.";

auto tensor_dtype =
(dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;

auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims);
node_output_map_[nid].push_back(TensorRTOpInput(input_tensor));
network_input_names_.push_back(name);
entry_id_map_[name] = entry_id + i;
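
The same float16-or-float32 acceptance check reappears for weights further down in GetDLTensorAsWeights. A small helper capturing the intended mapping (a sketch, not part of the patch; DLDataTypeToTrt is a hypothetical name):

// Sketch only: map a DLPack dtype to the TensorRT dtype used for inputs and weights.
inline nvinfer1::DataType DLDataTypeToTrt(const DLDataType& dtype) {
  ICHECK(static_cast<int>(dtype.code) == kDLFloat && (dtype.bits == 16 || dtype.bits == 32))
      << "Only float16 and float32 are supported.";
  return dtype.bits == 16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
}
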
@@ -141,15 +146,18 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
}
params.inputs.push_back(input);
}
ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size())
<< "Op expected a different number of inputs.";

// Convert op to TRT.
converter->Convert(&params);

// Get outputs.
node_output_map_[nid] = {};
for (auto out : params.outputs) {
auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType()
? params.inputs.at(0).tensor->getType()
: params.inputs.at(1).weight.type;
out->setType(out_type);

node_output_map_[nid].push_back(TensorRTOpInput(out));
}
}
@@ -205,18 +213,17 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
DLDeviceType src_device) {
ICHECK_EQ(dptr->device.device_type, src_device);
ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat ||
static_cast<int>(dptr->dtype.code) == kDLInt);
const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat
? nvinfer1::DataType::kFLOAT
: nvinfer1::DataType::kINT32;
ICHECK((dptr->dtype.bits == 16 || dptr->dtype.bits == 32))
<< "Invalid weight tensor type. Only float16 and float32 are supported.";
const auto trt_dtype = (static_cast<int>(dptr->dtype.bits) == 16) ? nvinfer1::DataType::kHALF
: nvinfer1::DataType::kFLOAT;

const size_t weight_bytes = GetDataSize(*dptr);
nvinfer1::Weights weight{trt_dtype, nullptr, 0};
size_t count = 1;
for (tvm_index_t i = 0; i < dptr->ndim; ++i) {
count *= dptr->shape[i];
}
ICHECK_EQ(count * 4, weight_bytes);
weight.count = count;
weight.values = new float[count];
ICHECK_EQ(TVMArrayCopyToBytes(const_cast<DLTensor*>(dptr), const_cast<void*>(weight.values),
@@ -250,7 +257,7 @@ void TensorRTBuilder::CleanUp() {
#endif
builder_->destroy();
for (auto weight : trt_weights_) {
if (weight.type == nvinfer1::DataType::kFLOAT) {
if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) {
delete[] static_cast<const float*>(weight.values);
} else {
delete[] static_cast<const uint16_t*>(weight.values);
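
With FP16 weights in play an element is no longer always 4 bytes, so any byte-count sanity check on a weight tensor has to derive the element width from the dtype instead of hard-coding it. A dtype-aware sketch (an assumption, not part of the rendered diff; CheckWeightByteCount is a hypothetical helper):

// Sketch only: dtype-aware byte-count check for a weight tensor.
inline void CheckWeightByteCount(const DLTensor* dptr, size_t count, size_t weight_bytes) {
  const size_t elem_bytes = dptr->dtype.bits / 8;  // 2 for float16, 4 for float32
  ICHECK_EQ(count * elem_bytes, weight_bytes);
}
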
2 changes: 1 addition & 1 deletion src/runtime/contrib/tensorrt/tensorrt_builder.h
@@ -68,7 +68,7 @@ class TensorRTBuilder {
* \param logger TensorRT logger to use for errors and warnings.
* \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
* \param use_implicit_batch Whether to use implicit batch mode (default)
* \param use_fp16 Whether to use implicit batch mode (default)
* \param use_fp16 Whether to automatically convert a model to fp16
* \param batch_size If use_implicit_batch,
*/
TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
38 changes: 26 additions & 12 deletions src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param
auto layer = params->network->addShuffle(*input);
ICHECK(layer != nullptr);
layer->setReshapeDimensions(VectorToTrtDims(new_shape));
layer->setOutputType(0, input->getType());
return layer->getOutput(0);
}

@@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar(
std::fill_n(dims.d, dims.nbDims, 1);
float* values = new float[1];
values[0] = value;
nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast<void*>(values), 1};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights weights{weight_type, static_cast<void*>(values), 1};
params->trt_weights->push_back(weights);
return params->network->addConstant(dims, weights)->getOutput(0);
}
@@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter {
input_tensor = shuffle_layer->getOutput(0);

const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;

nvinfer1::Weights bias{weight_type, nullptr, 0};

auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
params->inputs.at(1).weight, bias);
@@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter {
#endif

const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(conv_layer != nullptr);
@@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(conv_layer != nullptr);
@@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter {
// Weights are in KC format.
ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
const int num_units = params->inputs.at(1).weight_shape[0];
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected(
*input_tensor, num_units, params->inputs.at(1).weight, bias);
ICHECK(fc_layer != nullptr);
@@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter {
}

void* weight_scale_ptr = new float[gamma.count];
nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count};
const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type;
nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count};
params->trt_weights->push_back(weight_scale);
void* weight_shift_ptr = new float[gamma.count];
nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count};
const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type;
nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count};
params->trt_weights->push_back(weight_shift);
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type;
nvinfer1::Weights power{weight_type_power, nullptr, 0};

// fill in the content of weights for the Scale layer
const float* gamma_ptr = reinterpret_cast<const float*>(gamma.values);
@@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter {
input_tensor = Reshape(params, input_tensor, new_shape);
}

nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;

nvinfer1::Weights shift{weight_type, nullptr, 0};
nvinfer1::Weights power{weight_type, nullptr, 0};
nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
*input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power);
ICHECK(scale_layer != nullptr);
@@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(deconv_layer != nullptr);
@@ -1020,7 +1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(deconv_layer != nullptr);
2 changes: 1 addition & 1 deletion src/runtime/contrib/tensorrt/tensorrt_ops.h
@@ -76,7 +76,7 @@ struct TensorRTOpInput {
std::vector<int> weight_shape;

explicit TensorRTOpInput(nvinfer1::ITensor* tensor)
: tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {}
: tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {}
TensorRTOpInput(nvinfer1::Weights weight, const std::vector<int>& shape)
: tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {}
};
8 changes: 6 additions & 2 deletions src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
use_implicit_batch_(true),
max_workspace_size_(size_t(1) << 30),
max_batch_size_(-1),
multi_engine_mode_(false) {
multi_engine_mode_(false),
use_fp16_(false) {
const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
}

void BuildEngineFromJson(int batch_size) {
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
use_fp16, batch_size, calibrator_.get());
for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
* encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
* and more time spent building engines. */
bool multi_engine_mode_;

/*! \brief Use auto-conversion to fp16 */
bool use_fp16_;
};

runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,
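
Downstream, the request ultimately has to reach the TensorRT builder itself. With the TensorRT >= 6 API that is a single builder-config flag, and TensorRT can still fall back to FP32 kernels per layer when no FP16 implementation is available. A sketch of that call (the actual call site in tensorrt_builder.cc is not part of the rendered hunks; EnableFp16IfRequested is a hypothetical helper):

#include "NvInfer.h"

// Sketch only: ask TensorRT to consider FP16 kernels when building the engine.
void EnableFp16IfRequested(nvinfer1::IBuilderConfig* config, bool use_fp16) {
  if (use_fp16) {
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
  }
}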