Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Quantized version of RoIAlign #3624

Merged
merged 32 commits into from
Apr 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
bd7f639
WIP
NicolasHug Mar 31, 2021
8d21449
clang
NicolasHug Apr 1, 2021
d3e6e27
Merge branch 'master' of github.com:pytorch/vision into qroialign
NicolasHug Apr 2, 2021
68b0dd8
docs
NicolasHug Apr 2, 2021
c115b73
extracted out common utils
NicolasHug Apr 2, 2021
aadd2fc
Use better quantization function and pass tensors as parameters
NicolasHug Apr 3, 2021
81a3207
proper dequantization
NicolasHug Apr 3, 2021
295a6cc
Some tests
NicolasHug Apr 3, 2021
626f790
Dequantization optimization, seems to gain a few ms
NicolasHug Apr 3, 2021
b1b68f1
clang-format
NicolasHug Apr 3, 2021
fb45472
again
NicolasHug Apr 3, 2021
79bdfdf
more correct test. Had to remove optimization although it almost works
NicolasHug Apr 4, 2021
3dccaca
Also test aligned=True
NicolasHug Apr 4, 2021
c0b13fd
remove useless part
NicolasHug Apr 4, 2021
8527755
more docs and comments
NicolasHug Apr 4, 2021
efef48a
Put back optimization with more robust test
NicolasHug Apr 5, 2021
c061f6a
Merge branch 'master' into qroialign
NicolasHug Apr 6, 2021
07f3374
Merge branch 'master' of github.com:pytorch/vision into qroialign
NicolasHug Apr 7, 2021
d6f78ab
Added check for index upper bound
NicolasHug Apr 7, 2021
160669a
Merge branch 'qroialign' of github.com:NicolasHug/vision into qroialign
NicolasHug Apr 7, 2021
61564ca
avoid possible overflow
NicolasHug Apr 7, 2021
369fd33
Move common function into common.h
NicolasHug Apr 7, 2021
bcadc0f
oops
NicolasHug Apr 7, 2021
6792e65
scale=1,zero_point=0 makes more sense
NicolasHug Apr 7, 2021
dde14ed
Force batch size of 1 to prevent any indexing bug
NicolasHug Apr 7, 2021
29b29e0
Merge branch 'master' of github.com:pytorch/vision into qroialign
NicolasHug Apr 7, 2021
457aab0
format
NicolasHug Apr 7, 2021
0c7bb11
format again
NicolasHug Apr 7, 2021
e96cf1a
updated docstring
NicolasHug Apr 7, 2021
114475f
Merge branch 'master' of github.com:pytorch/vision into qroialign
NicolasHug Apr 8, 2021
45d083f
put back description comment for pre_calc_bilinear_interpolate
NicolasHug Apr 8, 2021
3ab6b66
revert most changes to docstring as it's taken care of in another PR
NicolasHug Apr 8, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions test/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,78 @@ def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwa
for aligned in (True, False):
super()._test_forward(device, contiguous, x_dtype, rois_dtype, aligned=aligned)

def test_qroialign(self):
    """Make sure quantized version of RoIAlign is close to float version"""
    pool_size = 5
    img_size = 10
    n_channels = 2
    num_imgs = 1
    dtype = torch.float

    def make_rois(num_rois=1000):
        # Boxes are (batch_index, x1, y1, x2, y2).
        rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
        rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,))  # set batch index
        rois[:, 3:] += rois[:, 1:3]  # make sure boxes aren't degenerate
        return rois

    for aligned in (True, False):
        for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
            for qdtype in (torch.qint8, torch.quint8, torch.qint32):
                x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
                qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)

                rois = make_rois()
                qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)

                # Dequantize so both the float and the quantized ops see
                # exactly the same input values.
                x, rois = qx.dequantize(), qrois.dequantize()

                common_kwargs = dict(
                    output_size=pool_size,
                    spatial_scale=1,
                    sampling_ratio=-1,
                    aligned=aligned,
                )
                y = ops.roi_align(x, rois, **common_kwargs)
                qy = ops.roi_align(qx, qrois, **common_kwargs)

                # The output qy is itself a quantized tensor and there might have been
                # a loss of info when it was quantized. For a fair comparison we need
                # to quantize y as well.
                qy_ref = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)

                # Ideally qy == qy_ref everywhere, which holds with
                # (scale, zero) == (1, 0). But the computations aren't exactly the
                # same between the 2 RoIAlign procedures, so rounding error may shift
                # an output by one quantization step. For example with
                # (scale, zero) = (2, 10), 45.00000... is quantized to 44 but
                # 45.00000001 is rounded to 46. When exact equality fails we check:
                # - such discrepancies are very rare (less than 5%)
                # - any difference between qy and qy_ref is == scale
                if not (qy == qy_ref).all():
                    mismatch_idx = torch.where(qy != qy_ref)
                    num_diff = mismatch_idx[0].numel()
                    self.assertTrue(num_diff / qy.numel() < .05)

                    abs_diff = torch.abs(qy[mismatch_idx].dequantize() - qy_ref[mismatch_idx].dequantize())
                    expected = torch.full_like(abs_diff, fill_value=scale)
                    self.assertTrue(torch.allclose(abs_diff, expected, atol=1e-5))

    # The quantized kernel only supports a single image per batch.
    x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
    qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
    rois = make_rois(10)
    qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
    with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
        ops.roi_align(qx, qrois, output_size=pool_size)


class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
Expand Down
128 changes: 128 additions & 0 deletions torchvision/csrc/ops/cpu/roi_align_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#pragma once

#include <ATen/ATen.h>

namespace vision {
namespace ops {
namespace detail {

template <typename T>
struct PreCalc {
  // Flat indices (into a height*width image plane) of the 4 pixels
  // surrounding a sampling point.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Bilinear interpolation weights associated with pos1..pos4.
  T w1;
  T w2;
  T w3;
  T w4;
};

// This helper computes the interpolation weights (w1, w2...) for every
// sampling point of a given box. There are pool_height * pool_width *
// roi_bin_grid_h * roi_bin_grid_w such sampling points.
//
// The weights (w1, w2...) are computed as the areas in this figure:
// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
// and pos1, pos2 etc correspond to the indices of their respective pixels.
//
// Note: the weights and indices are shared across all channels, which is why
// they are pre-calculated prior to the main loop in the RoIAlign kernel.
// implementation taken from Caffe2
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int idx = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        // Sampling points sit at the centers of a regular sub-grid inside
        // each bin, e.g. at 0.5, 1.5, ...
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h);
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T y = yy;
          T x = xx;
          // Sampling points that fall outside the feature map contribute
          // nothing: store zeroed indices and weights.
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            pre_calc[idx++] = PreCalc<T>{};
            continue;
          }

          // Clamp slightly-negative coordinates onto the map.
          y = (y <= 0) ? static_cast<T>(0) : y;
          x = (x <= 0) ? static_cast<T>(0) : x;

          int y_low = static_cast<int>(y);
          int y_high = y_low + 1;
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          }

          int x_low = static_cast<int>(x);
          int x_high = x_low + 1;
          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          }

          // Fractional offsets inside the pixel cell; the 4 weights are the
          // areas of the corresponding sub-rectangles.
          const T ly = y - y_low;
          const T lx = x - x_low;
          const T hy = 1. - ly;
          const T hx = 1. - lx;

          pre_calc[idx++] = PreCalc<T>{
              y_low * width + x_low,
              y_low * width + x_high,
              y_high * width + x_low,
              y_high * width + x_high,
              hy * hx,
              hy * lx,
              ly * hx,
              ly * lx};
        }
      }
    }
  }
}

} // namespace detail
} // namespace ops
} // namespace vision
125 changes: 8 additions & 117 deletions torchvision/csrc/ops/cpu/roi_align_kernel.cpp
Original file line number Diff line number Diff line change
@@ -1,120 +1,13 @@
#include <ATen/ATen.h>
#include <torch/library.h>

#include "./roi_align_common.h"

namespace vision {
namespace ops {

namespace {

// implementation taken from Caffe2
// Indices and bilinear weights of the 4 pixels around one sampling point.
template <typename T>
struct PreCalc {
  // Flat indices (into a height*width plane) of the 4 neighboring pixels.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Bilinear interpolation weights for pos1..pos4.
  T w1;
  T w2;
  T w3;
  T w4;
};

// Precomputes, for every sampling point of a box, the indices (pos1..pos4)
// of its 4 neighboring pixels and the corresponding bilinear interpolation
// weights (w1..w4). Results are written into pre_calc, which must already be
// sized to pooled_height * pooled_width * roi_bin_grid_h * roi_bin_grid_w.
// The weights and indices are shared across channels, hence the
// pre-calculation before the per-channel loop of the RoIAlign kernel.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    int iy_upper,
    int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        // Sampling points sit at the centers of a sub-grid inside each bin.
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
            static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
              static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // empty: this sampling point contributes nothing
            PreCalc<T> pc;
            pc.pos1 = 0;
            pc.pos2 = 0;
            pc.pos3 = 0;
            pc.pos4 = 0;
            pc.w1 = 0;
            pc.w2 = 0;
            pc.w3 = 0;
            pc.w4 = 0;
            pre_calc[pre_calc_index] = pc;
            pre_calc_index += 1;
            continue;
          }

          // Clamp slightly-negative coordinates onto the map.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = (int)y;
          int x_low = (int)x;
          int y_high;
          int x_high;

          // Clamp to the last row/column when on or past the far edge.
          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = (T)y_low;
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = (T)x_low;
          } else {
            x_high = x_low + 1;
          }

          // Fractional offsets inside the pixel cell; the 4 weights are the
          // areas of the corresponding sub-rectangles.
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void roi_align_forward_kernel_impl(
int n_rois,
Expand Down Expand Up @@ -167,17 +60,15 @@ void roi_align_forward_kernel_impl(
// When the grid is empty, output zeros.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(
// we want to precalculate indices and weights shared by all chanels,
// this is the key point of optimization
std::vector<detail::PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
detail::pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
Expand All @@ -199,15 +90,15 @@ void roi_align_forward_kernel_impl(
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
detail::PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];

pre_calc_index += 1;
}
}
output_val /= count;
output_val /= count; // Average pooling

output[index] = output_val;
} // for pw
Expand Down
Loading