[Relay][QNN] Simulated Quantize and Dequantize #7613
@@ -0,0 +1,52 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# pylint: disable=invalid-name, unused-argument, len-as-condition
"""QNN operator feature registration"""

from tvm import topi

from ...op.op import register_compute
from ...op.op import register_injective_schedule
from ...op.op import register_pattern, OpPattern


@register_compute("qnn.simulated_quantize")
def simulated_quantize_compute(attrs, inputs, output_type):
    assert len(inputs) == 4
    return [
        topi.nn.simulated_quantize(
            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
        )
    ]


register_injective_schedule("qnn.simulated_quantize")
register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE)


@register_compute("qnn.simulated_dequantize")
def simulated_dequantize_compute(attrs, inputs, output_type):
    assert len(inputs) == 4
    return [
        topi.nn.simulated_dequantize(
            inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis")
        )
    ]


register_injective_schedule("qnn.simulated_dequantize")
register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE)
@@ -0,0 +1,190 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Quantized Neural Network (QNN) Operators"""
import tvm
from tvm import te, tir, topi

SQNN_DISABLE = 0
SQNN_INT8 = 1
SQNN_UINT8 = 2
SQNN_INT32 = 3

SQNN_DTYPE_TO_CODE = {
    "disable": SQNN_DISABLE,
    "int8": SQNN_INT8,
    "uint8": SQNN_UINT8,
    "int32": SQNN_INT32,
}

SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()}


@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
def simulated_quantize(data, out_dtype, output_scale=None, output_zero_point=None, axis=-1):
    """Simulated QNN quantize operator that mimics QNN outputs without changing datatype.
    The benefit of this operator over true QNN quantize is that this operator allows dynamic
    datatype selection and can operate on both per-channel and scalar scales and zero points while
    QNN quantize requires both of these to be fixed at compile time.

    Parameters
    ----------
    data: tvm.te.Tensor
        An N-D input tensor to the operator.

    out_dtype: tvm.te.Tensor
        A scalar variable that indicates which datatype to simulate quantization with. Use
        SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable
        value.

    output_scale: tvm.te.Tensor, optional
        A scalar tensor representing the scale to use when quantizing to integer datatypes.
        When it contains more than a single value, N must match the number of channels in data.

    output_zero_point: tvm.te.Tensor, optional
[Review comment] Typically, the zero points are scalar. Even for asymmetric, they are scalar. This is done mostly for performance reasons. But, since these ops are generic, it's better to keep it the way that you have.
        A 1-D tensor representing the zero point to use when quantizing to integer datatypes.
        When it contains more than a single value, N must match the number of channels in data.

    axis: int, optional
        The channel axis for quantization. Default value is -1 which corresponds to the last axis.

    """
    # When disabled, just pass through the input values.
    def _compute_pass_through(value, *indices):
        return value[indices]

    # Simulate quantization for arbitrary integer datatypes. The computation for all datatypes is:
    #     Q_output = clip((round(input_tensor/output_scale) + output_zero_point),
    #                     out_dtype::min,
    #                     out_dtype::max)
    def _compute_intn(dtype, value, *indices):
        assert output_scale is not None and output_zero_point is not None
        const_min = tvm.tir.min_value(dtype)
        const_max = tvm.tir.max_value(dtype)
        # Use indexmod to handle both scalar and per-channel QNN parameters.
        scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0])
        zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0])
        return te.max(
            te.min(
                te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx],
                const_max,
            ),
            const_min,
        )

    # Use an if chain to dynamically return the proper quantization based on the input datatype.
    # This allows the op to compile once but apply different quantization approaches
    # using a variable datatype input.
    def _dispatch_sim_quantize(value):
[Review comment] +1, clever trick.
        pass_through_value = te.compute(
            data.shape, lambda *indices: _compute_pass_through(value, *indices)
        )
        int8_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
                _compute_intn("int8", value, *indices),
                pass_through_value[indices],
            ),
        )
        uint8_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
                _compute_intn("uint8", value, *indices),
                int8_value[indices],
            ),
        )
        int32_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
                _compute_intn("int32", value, *indices),
                uint8_value[indices],
            ),
        )

        return int32_value

    return te.compute(data.shape, lambda *indices: _dispatch_sim_quantize(data)[indices])
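The if-chain above ultimately evaluates the formula from the comment, Q = clip(round(x / scale) + zero_point, dtype_min, dtype_max), while keeping the result in float32 so the datatype never actually changes. A minimal NumPy sketch of the same arithmetic, for reference only (the function name, shapes, and broadcasting here are illustrative assumptions, not part of this patch):

```python
import numpy as np

def simulated_quantize_ref(data, dtype, scale, zero_point, axis=-1):
    """NumPy reference for Q = clip(round(x / scale) + zp, dtype_min, dtype_max), kept in float32."""
    if dtype == "disable":
        return data  # pass-through, mirrors _compute_pass_through
    info = np.iinfo(np.dtype(dtype))
    # Reshape per-channel scale/zero_point so they broadcast along `axis`; scalars are used as-is.
    shape = [1] * data.ndim
    shape[axis] = -1
    if np.size(scale) > 1:
        scale = np.asarray(scale, dtype="float32").reshape(shape)
    if np.size(zero_point) > 1:
        zero_point = np.asarray(zero_point).reshape(shape)
    q = np.round(data / scale) + zero_point
    return np.clip(q, info.min, info.max).astype("float32")
```

The TE version reaches the same result with `indexmod` on the channel index, which covers both the scalar and per-channel cases without branching.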


@tvm.te.tag_scope(tag=topi.tag.ELEMWISE)
def simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1):
    """Simulated QNN dequantize operator that mimics QNN outputs without changing datatype.
    The benefit of this operator over true QNN dequantize is that this operator allows dynamic
    datatype selection and can operate on both per-channel and scalar scales and zero points while
    QNN dequantize requires both of these to be fixed at compile time.

    Parameters
    ----------
    data: tvm.te.Tensor
        An N-D input tensor to the operator.

    in_dtype: tvm.te.Tensor
        A scalar variable that indicates which datatype to simulate dequantization with. Use
        SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable
        value.

    input_scale: tvm.te.Tensor, optional
        A scalar tensor representing the scale to use when dequantizing from integer datatypes.
        When it contains more than a single value, N must match the number of channels in data.

    input_zero_point: tvm.te.Tensor, optional
        A 1-D tensor representing the zero point to use when dequantizing from integer datatypes.
        When it contains more than a single value, N must match the number of channels in data.

    axis: int, optional
        The channel axis for quantization. Default value is -1 which corresponds to the last axis.

    """
    # When disabled, simply return the input tensor.
    def _compute_pass_through(value, *indices):
        return value[indices]

    # Simulate dequantization for arbitrary integer datatypes. The computation for all datatypes is:
    #     DQ_output = (input - zero_point) * scale
    def _compute_intn(value, *indices):
        assert input_scale is not None and input_zero_point is not None
        # Use indexmod to handle both scalar and per-channel QNN parameters.
        scale_idx = tir.indexmod(indices[axis], topi.shape(input_scale)[0])
        zp_idx = tir.indexmod(indices[axis], topi.shape(input_zero_point)[0])
        return (value[indices] - input_zero_point[zp_idx]) * input_scale[scale_idx]

    # Use an if chain to dynamically return the proper dequantization based on the input datatype.
    # This allows the op to compile once but apply different quantization approaches
    # using a variable datatype input.
    def _dispatch_sim_dequantize(value):
        pass_through_value = te.compute(
            data.shape, lambda *indices: _compute_pass_through(value, *indices)
        )
        intn_condition = tvm.te.any(
            in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]),
            in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]),
            in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]),
        )
        intn_value = te.compute(
            data.shape,
            lambda *indices: tir.if_then_else(
                intn_condition,
                _compute_intn(value, *indices),
                pass_through_value[indices],
            ),
        )

        return intn_value

    return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices])
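As in the quantize path, the computation for any of the integer codes reduces to the commented formula DQ = (x - zero_point) * scale, with a pass-through when the op is disabled. A NumPy sketch of that math under the same illustrative assumptions as the quantize reference above:

```python
import numpy as np

def simulated_dequantize_ref(data, dtype, scale, zero_point, axis=-1):
    """NumPy reference for DQ = (x - zero_point) * scale, operating on float32 data."""
    if dtype == "disable":
        return data  # pass-through
    # Reshape per-channel scale/zero_point so they broadcast along `axis`; scalars are used as-is.
    shape = [1] * data.ndim
    shape[axis] = -1
    if np.size(scale) > 1:
        scale = np.asarray(scale, dtype="float32").reshape(shape)
    if np.size(zero_point) > 1:
        zero_point = np.asarray(zero_point).reshape(shape)
    return ((data - zero_point) * scale).astype("float32")
```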
Note that the use of integer codes to map to datatypes is a hack since relay doesn't currently support string variables. Once it does, this can be simplified. Until then, this allows datatypes to be dynamically changed without recompilation.
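The mapping itself is just the small dict defined at the top of the file. A short sketch of how a caller might translate between dtype strings and codes (values copied from the dicts above; the usage shown is illustrative):

```python
# Values copied from SQNN_DTYPE_TO_CODE in this patch; "disable" (0) means pass-through.
SQNN_DTYPE_TO_CODE = {"disable": 0, "int8": 1, "uint8": 2, "int32": 3}
SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()}

# A frontend that wants to simulate int8 quantization feeds the code 1 to the op as a
# runtime value; switching to uint8 later only changes this input, not the compiled kernel.
code = SQNN_DTYPE_TO_CODE["int8"]  # -> 1
assert SQNN_CODE_TO_DTYPE[code] == "int8"
```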