[BYOC][TENSORRT] Add support for FP16 on TensorRT BYOC flow #10388

Merged: 7 commits, Mar 11, 2022
Changes from 5 commits
127 changes: 23 additions & 104 deletions python/tvm/relay/op/contrib/tensorrt.py

Large diffs are not rendered by default.

11 changes: 10 additions & 1 deletion src/relay/backend/contrib/tensorrt/codegen.cc
@@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
bool use_implicit_batch;
size_t max_workspace_size;
bool remove_no_mac_subgraphs;
bool use_fp16;
bool use_uint8;

TVM_DECLARE_ATTRS(TensorRTCompilerConfigNode, "ext.attrs.TensorRTCompilerConfigNode") {
TVM_ATTR_FIELD(tensorrt_version)
@@ -54,6 +56,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNode<TensorRTCompilerConfig
TVM_ATTR_FIELD(use_implicit_batch).set_default(true);
TVM_ATTR_FIELD(max_workspace_size).set_default(size_t(1) << 30);
TVM_ATTR_FIELD(remove_no_mac_subgraphs).set_default(false);
TVM_ATTR_FIELD(use_fp16).set_default(false);
TVM_ATTR_FIELD(use_uint8).set_default(false);
}
};
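For context, a minimal sketch of how a pass can read the new flags, assuming they stay registered under the existing "relay.ext.tensorrt.options" PassContext key (this mirrors the lookup already used elsewhere in codegen.cc and is not part of the diff):

```cpp
#include <tvm/ir/attrs.h>
#include <tvm/ir/transform.h>

// Fetch the TensorRT compiler options, falling back to the declared defaults
// (use_fp16 = false, use_uint8 = false) when nothing was set in the PassContext.
bool Fp16Requested() {
  auto cfg = tvm::transform::PassContext::Current()
                 ->GetConfig<TensorRTCompilerConfig>("relay.ext.tensorrt.options");
  if (!cfg.defined()) {
    cfg = tvm::AttrsWithDefaultValues<TensorRTCompilerConfig>();
  }
  return cfg.value()->use_fp16;
}
```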

@@ -215,13 +219,18 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer {
std::to_string(cfg.value()->tensorrt_version[2])};
std::vector<std::string> use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)};
std::vector<std::string> max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr;
std::vector<std::string> use_fp16 = {std::to_string(cfg.value()->use_fp16)};
std::vector<std::string> use_uint8 = {std::to_string(cfg.value()->use_uint8)};
std::vector<dmlc::any> tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr,
use_fp16_attr, use_uint8_attr;
tensorrt_version_attr.emplace_back(tensorrt_version);
use_implicit_batch_attr.emplace_back(use_implicit_batch);
max_workspace_size_attr.emplace_back(max_workspace_size);
node->SetAttr("tensorrt_version", tensorrt_version_attr);
node->SetAttr("use_implicit_batch", use_implicit_batch_attr);
node->SetAttr("max_workspace_size", max_workspace_size_attr);
node->SetAttr("use_fp16", use_fp16_attr);
node->SetAttr("use_uint8", use_uint8_attr);
}
};

25 changes: 14 additions & 11 deletions src/runtime/contrib/tensorrt/tensorrt_builder.cc
@@ -85,8 +85,10 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode&
shape.erase(shape.begin());
}
nvinfer1::Dims dims = VectorToTrtDims(shape);
ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported.";
auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims);
auto tensor_dtype =
(dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
Contributor comment: I'd suggest ICHECK failing if unsupported type.
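A minimal sketch of the guard being suggested, reusing the TypeMatch helper from the removed check (hypothetical; not part of this diff):

```cpp
// Fail loudly on anything that is not fp16 or fp32 instead of silently
// falling back to kFLOAT for unsupported input dtypes.
ICHECK(TypeMatch(dtypes[i], kDLFloat, 16) || TypeMatch(dtypes[i], kDLFloat, 32))
    << "Only FP16 and FP32 inputs are supported.";
auto tensor_dtype =
    TypeMatch(dtypes[i], kDLFloat, 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT;
```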


auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims);
node_output_map_[nid].push_back(TensorRTOpInput(input_tensor));
network_input_names_.push_back(name);
entry_id_map_[name] = entry_id + i;
@@ -141,15 +143,18 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) {
}
params.inputs.push_back(input);
}
ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size())
<< "Op expected a different number of inputs.";

// Convert op to TRT.
converter->Convert(&params);

// Get outputs.
node_output_map_[nid] = {};
for (auto out : params.outputs) {
auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType()
? params.inputs.at(0).tensor->getType()
: params.inputs.at(1).weight.type;

Contributor comment: Can you explain this? It seems very specific, yet AddLayer is used for all of the supported ops.

Contributor comment (@mbs-octoml, Mar 11, 2022): This is unfortunately causing a vector index exception for me. I believe we need to pick up the output type from the node's dtype vector.
out->setType(out_type);

node_output_map_[nid].push_back(TensorRTOpInput(out));
}
}
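A sketch of the alternative raised in the comments above: take each output's type from the JSON node's dtype vector rather than from params.inputs, so single-input ops never index params.inputs.at(1) (hypothetical; assumes JSONGraphNode::GetOpDataType() holds one dtype per output):

```cpp
// Get outputs, deriving each output's TensorRT dtype from the node itself.
node_output_map_[nid] = {};
const std::vector<DLDataType>& out_dtypes = node.GetOpDataType();
for (size_t i = 0; i < params.outputs.size(); ++i) {
  nvinfer1::ITensor* out = params.outputs[i];
  const bool is_fp16 = out_dtypes[i].code == kDLFloat && out_dtypes[i].bits == 16;
  out->setType(is_fp16 ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT);
  node_output_map_[nid].push_back(TensorRTOpInput(out));
}
```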
@@ -205,18 +210,16 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() {
nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr,
DLDeviceType src_device) {
ICHECK_EQ(dptr->device.device_type, src_device);
ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat ||
static_cast<int>(dptr->dtype.code) == kDLInt);
const auto trt_dtype = static_cast<int>(dptr->dtype.code) == kDLFloat
? nvinfer1::DataType::kFLOAT
: nvinfer1::DataType::kINT32;

const auto trt_dtype = (static_cast<int>(dptr->dtype.bits) == 16) ? nvinfer1::DataType::kHALF
: nvinfer1::DataType::kFLOAT;

Contributor comment: Another ICHECK would be in order to make sure we're not silently generating bad code.
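A sketch of the additional check being requested here (hypothetical; not part of this diff):

```cpp
// Only fp16/fp32 float tensors are handled on this path; refuse anything else
// rather than silently tagging it with the wrong TensorRT dtype.
ICHECK(static_cast<int>(dptr->dtype.code) == kDLFloat &&
       (dptr->dtype.bits == 16 || dptr->dtype.bits == 32))
    << "Unsupported weight dtype with bits=" << static_cast<int>(dptr->dtype.bits);
```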

const size_t weight_bytes = GetDataSize(*dptr);
nvinfer1::Weights weight{trt_dtype, nullptr, 0};
size_t count = 1;
for (tvm_index_t i = 0; i < dptr->ndim; ++i) {
count *= dptr->shape[i];
}
ICHECK_EQ(count * 4, weight_bytes);
weight.count = count;
weight.values = new float[count];
ICHECK_EQ(TVMArrayCopyToBytes(const_cast<DLTensor*>(dptr), const_cast<void*>(weight.values),
@@ -250,7 +253,7 @@ void TensorRTBuilder::CleanUp() {
#endif
builder_->destroy();
for (auto weight : trt_weights_) {
if (weight.type == nvinfer1::DataType::kFLOAT) {
if (static_cast<int>(weight.type) <= 1) {
Contributor comment: Can we avoid hard coding the enum constants?

delete[] static_cast<const float*>(weight.values);
} else {
delete[] static_cast<const uint16_t*>(weight.values);
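One way the hard-coded constant could be avoided, per the comment above (a hypothetical sketch, not part of this diff): branch on the enum values themselves so the delete[] type stays paired with how GetDLTensorAsWeights allocated the buffer:

```cpp
for (auto weight : trt_weights_) {
  switch (weight.type) {
    case nvinfer1::DataType::kFLOAT:
    case nvinfer1::DataType::kHALF:
      // fp32 and fp16 weights are both allocated as float[] in GetDLTensorAsWeights.
      delete[] static_cast<const float*>(weight.values);
      break;
    default:
      delete[] static_cast<const uint16_t*>(weight.values);
      break;
  }
}
```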
2 changes: 1 addition & 1 deletion src/runtime/contrib/tensorrt/tensorrt_builder.h
@@ -68,7 +68,7 @@ class TensorRTBuilder {
* \param logger TensorRT logger to use for errors and warnings.
* \param max_workspace_size Workspace size parameter for TensorRT engine build phase.
* \param use_implicit_batch Whether to use implicit batch mode (default)
* \param use_fp16 Whether to use implicit batch mode (default)
* \param use_fp16 Whether to automatically convert a model to fp16
* \param batch_size If use_implicit_batch,
*/
TensorRTBuilder(TensorRTLogger* logger, const std::vector<const DLTensor*>& data_entry,
38 changes: 26 additions & 12 deletions src/runtime/contrib/tensorrt/tensorrt_ops.cc
@@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param
auto layer = params->network->addShuffle(*input);
ICHECK(layer != nullptr);
layer->setReshapeDimensions(VectorToTrtDims(new_shape));
layer->setOutputType(0, input->getType());
return layer->getOutput(0);
}

@@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar(
std::fill_n(dims.d, dims.nbDims, 1);
float* values = new float[1];
values[0] = value;
nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast<void*>(values), 1};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights weights{weight_type, static_cast<void*>(values), 1};
params->trt_weights->push_back(weights);
return params->network->addConstant(dims, weights)->getOutput(0);
}
@@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter {
input_tensor = shuffle_layer->getOutput(0);

const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;

nvinfer1::Weights bias{weight_type, nullptr, 0};

auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
params->inputs.at(1).weight, bias);
@@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter {
#endif

const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(conv_layer != nullptr);
@@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(conv_layer != nullptr);
@@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter {
// Weights are in KC format.
ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2);
const int num_units = params->inputs.at(1).weight_shape[0];
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected(
*input_tensor, num_units, params->inputs.at(1).weight, bias);
ICHECK(fc_layer != nullptr);
@@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter {
}

void* weight_scale_ptr = new float[gamma.count];
nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count};
const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type;
nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count};
params->trt_weights->push_back(weight_scale);
void* weight_shift_ptr = new float[gamma.count];
nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count};
const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type;
nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count};
params->trt_weights->push_back(weight_shift);
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type;
nvinfer1::Weights power{weight_type_power, nullptr, 0};

// fill in the content of weights for the Scale layer
const float* gamma_ptr = reinterpret_cast<const float*>(gamma.values);
@@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter {
input_tensor = Reshape(params, input_tensor, new_shape);
}

nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0};
nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;

nvinfer1::Weights shift{weight_type, nullptr, 0};
nvinfer1::Weights power{weight_type, nullptr, 0};
nvinfer1::IScaleLayer* scale_layer = params->network->addScale(
*input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power);
ICHECK(scale_layer != nullptr);
@@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(deconv_layer != nullptr);
@@ -1020,7 +1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter {
const int num_outputs =
std::stoi(params->node.GetAttr<std::vector<std::string>>("channels")[0]);
const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]);
nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0};
const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type;
nvinfer1::Weights bias{weight_type, nullptr, 0};
auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size,
params->inputs.at(1).weight, bias);
ICHECK(deconv_layer != nullptr);
2 changes: 1 addition & 1 deletion src/runtime/contrib/tensorrt/tensorrt_ops.h
@@ -76,7 +76,7 @@ struct TensorRTOpInput {
std::vector<int> weight_shape;

explicit TensorRTOpInput(nvinfer1::ITensor* tensor)
: tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {}
: tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {}
TensorRTOpInput(nvinfer1::Weights weight, const std::vector<int>& shape)
: tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {}
};
8 changes: 6 additions & 2 deletions src/runtime/contrib/tensorrt/tensorrt_runtime.cc
@@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase {
use_implicit_batch_(true),
max_workspace_size_(size_t(1) << 30),
max_batch_size_(-1),
multi_engine_mode_(false) {
multi_engine_mode_(false),
use_fp16_(false) {
const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false);
multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false);
num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0);
@@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase {
}

void BuildEngineFromJson(int batch_size) {
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false);
const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_;
TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_,
use_fp16, batch_size, calibrator_.get());
for (size_t i = 0; i < input_nodes_.size(); ++i) {
@@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase {
* encountered. Multi-engine mode should give better performance, at a cost of higher memory usage
* and more time spent building engines. */
bool multi_engine_mode_;

/*! \brief Use auto-conversion to fp16 */
bool use_fp16_;
};
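For completeness, a sketch of how use_fp16_ could be latched from the serialized "use_fp16" attribute written by the codegen above (an assumption about the plumbing; it mirrors how LoadGlobalAttributes already reads use_implicit_batch and max_workspace_size from the first node that carries them):

```cpp
// Inside TensorRTRuntime: pick up the global "use_fp16" flag that codegen
// attached to the op nodes; leave the default (false) when it is absent.
void LoadUseFp16Attribute() {
  for (size_t i = 0; i < nodes_.size(); ++i) {
    if (nodes_[i].HasAttr("use_fp16")) {
      use_fp16_ = std::stoi(nodes_[i].GetAttr<std::vector<std::string>>("use_fp16")[0]) != 0;
      return;
    }
  }
}
```

Either this attribute or the TVM_TENSORRT_USE_FP16 environment variable then enables FP16 in BuildEngineFromJson.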

runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json,