Skip to content

Commit

Permalink
[fbsync] Add Quantized version of RoIAlign (#3624)
Browse files Browse the repository at this point in the history
Summary:
* WIP

* clang

* docs

* extracted out common utils

* Use better quantization function and pass tensors as parameters

* proper dequantization

* Some tests

* Dequantization optimization, seems to gain a few ms

* clang-format

* again

* more correct test. Had to remove optimization although it almost works

* Also test aligned=True

* remove useless part

* more docs and comments

* Put back optimization with more robust test

* Added check for index upper bound

* avoid possible overflow

* Move common function into common.h

* oops

* scale=1,zero_point=0 makes more sense

* Force batch size of 1 to prevent any indexing bug

* format

* format again

* updated docstring

* put back description comment for pre_calc_bilinear_interpolate

* revert most changes to docstring as it's taken care of in another PR

Reviewed By: NicolasHug

Differential Revision: D27706946

fbshipit-source-id: 2ae1614c214ea676b4f7705dc0716efd9f34330e
  • Loading branch information
fmassa authored and facebook-github-bot committed Apr 13, 2021
1 parent d7d4e9e commit 5af2cf0
Show file tree
Hide file tree
Showing 5 changed files with 417 additions and 117 deletions.
72 changes: 72 additions & 0 deletions test/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,78 @@ def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwa
for aligned in (True, False):
super()._test_forward(device, contiguous, x_dtype, rois_dtype, aligned=aligned)

def test_qroialign(self):
"""Make sure quantized version of RoIAlign is close to float version"""
pool_size = 5
img_size = 10
n_channels = 2
num_imgs = 1
dtype = torch.float

def make_rois(num_rois=1000):
rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index
rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate
return rois

for aligned in (True, False):
for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
for qdtype in (torch.qint8, torch.quint8, torch.qint32):

x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)

rois = make_rois()
qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)

x, rois = qx.dequantize(), qrois.dequantize() # we want to pass the same inputs

y = ops.roi_align(
x,
rois,
output_size=pool_size,
spatial_scale=1,
sampling_ratio=-1,
aligned=aligned,
)
qy = ops.roi_align(
qx,
qrois,
output_size=pool_size,
spatial_scale=1,
sampling_ratio=-1,
aligned=aligned,
)

# The output qy is itself a quantized tensor and there might have been a loss of info when it was
# quantized. For a fair comparison we need to quantize y as well
quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)

try:
# Ideally, we would assert this, which passes with (scale, zero) == (1, 0)
self.assertTrue((qy == quantized_float_y).all())
except AssertionError:
# But because the computation aren't exactly the same between the 2 RoIAlign procedures, some
# rounding error may lead to a difference of 2 in the output.
# For example with (scale, zero) = (2, 10), 45.00000... will be quantized to 44
# but 45.00000001 will be rounded to 46. We make sure below that:
# - such discrepancies between qy and quantized_float_y are very rare (less then 5%)
# - any difference between qy and quantized_float_y is == scale
diff_idx = torch.where(qy != quantized_float_y)
num_diff = diff_idx[0].numel()
self.assertTrue(num_diff / qy.numel() < .05)

abs_diff = torch.abs(qy[diff_idx].dequantize() - quantized_float_y[diff_idx].dequantize())
t_scale = torch.full_like(abs_diff, fill_value=scale)
self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))

x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
rois = make_rois(10)
qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
ops.roi_align(qx, qrois, output_size=pool_size)


class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
Expand Down
128 changes: 128 additions & 0 deletions torchvision/csrc/ops/cpu/roi_align_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#pragma once

#include <ATen/ATen.h>

namespace vision {
namespace ops {
namespace detail {

template <typename T>
struct PreCalc {
  // Flat indices (into a height*width channel plane) of the 4 neighboring
  // pixels used for bilinear interpolation.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Corresponding bilinear interpolation weights.
  T w1;
  T w2;
  T w3;
  T w4;
};

// This helper computes the interpolation weights (w1, w2...) for every sampling
// point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
// roi_bin_grid_w such sampling points.
//
// The weights (w1, w2...) are computed as the areas in this figure:
// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
// and pos1, pos2 etc correspond to the indices of their respective pixels.
//
// Note: the weights and indices are shared across all channels, which is why
// they are pre-calculated prior to the main loop in the RoIAlign kernel.
// implementation taken from Caffe2
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // Sampling point is outside the feature map: value-initialize
            // (all indices and weights zero) so it contributes nothing.
            pre_calc[pre_calc_index] = PreCalc<T>{};
            pre_calc_index += 1;
            continue;
          }

          // Points slightly out of bounds (in [-1, 0]) are clamped to the border.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Fractional offsets within the pixel cell; the 4 weights are the
          // areas of the opposite sub-rectangles (they sum to 1).
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

} // namespace detail
} // namespace ops
} // namespace vision
125 changes: 8 additions & 117 deletions torchvision/csrc/ops/cpu/roi_align_kernel.cpp
Original file line number Diff line number Diff line change
@@ -1,120 +1,13 @@
#include <ATen/ATen.h>
#include <torch/library.h>

#include "./roi_align_common.h"

namespace vision {
namespace ops {

namespace {

// implementation taken from Caffe2
template <typename T>
struct PreCalc {
  // Flat indices (into a height*width channel plane) of the 4 neighboring
  // pixels used for bilinear interpolation.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Corresponding bilinear interpolation weights.
  T w1;
  T w2;
  T w3;
  T w4;
};

// Pre-computes the bilinear-interpolation weights and pixel indices for every
// sampling point of a box (iy_upper * ix_upper points per pooled output cell),
// so the main RoIAlign loop can reuse them across all channels.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    int iy_upper,
    int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // Sampling point is outside the feature map: value-initialize
            // (all indices and weights zero) so it contributes nothing.
            pre_calc[pre_calc_index] = PreCalc<T>{};
            pre_calc_index += 1;
            continue;
          }

          // Points slightly out of bounds (in [-1, 0]) are clamped to the border.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Fractional offsets within the pixel cell; the 4 weights are the
          // areas of the opposite sub-rectangles (they sum to 1).
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void roi_align_forward_kernel_impl(
int n_rois,
Expand Down Expand Up @@ -167,17 +60,15 @@ void roi_align_forward_kernel_impl(
// When the grid is empty, output zeros.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(
// we want to precalculate indices and weights shared by all chanels,
// this is the key point of optimization
std::vector<detail::PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
detail::pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
Expand All @@ -199,15 +90,15 @@ void roi_align_forward_kernel_impl(
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
detail::PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];

pre_calc_index += 1;
}
}
output_val /= count;
output_val /= count; // Average pooling

output[index] = output_val;
} // for pw
Expand Down
Loading

0 comments on commit 5af2cf0

Please sign in to comment.