Skip to content

Commit

Permalink
[fbsync] Add Quantized version of RoIAlign (#3624)
Browse files Browse the repository at this point in the history
Summary:
* WIP

* clang

* docs

* extracted out common utils

* Use better quantization function and pass tensors as parameters

* proper dequantization

* Some tests

* Dequantization optimization, seems to gain a few ms

* clang-format

* again

* more correct test. Had to remove optimization although it almost works

* Also test aligned=True

* remove useless part

* more docs and comments

* Put back optimization with more robust test

* Added check for index upper bound

* avoid possible overflow

* Move common function into common.h

* oops

* scale=1,zero_point=0 makes more sense

* Force batch size of 1 to prevent any indexing bug

* format

* format again

* updated docstring

* put back description comment for pre_calc_bilinear_interpolate

* revert most changes to docstring as it's taken care of in another PR

Reviewed By: NicolasHug

Differential Revision: D27706946

fbshipit-source-id: 2ae1614c214ea676b4f7705dc0716efd9f34330e
  • Loading branch information
fmassa authored and facebook-github-bot committed Apr 13, 2021
1 parent d7d4e9e commit 5af2cf0
Show file tree
Hide file tree
Showing 5 changed files with 417 additions and 117 deletions.
72 changes: 72 additions & 0 deletions test/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,78 @@ def _test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwa
for aligned in (True, False):
super()._test_forward(device, contiguous, x_dtype, rois_dtype, aligned=aligned)

def test_qroialign(self):
"""Make sure quantized version of RoIAlign is close to float version"""
pool_size = 5
img_size = 10
n_channels = 2
num_imgs = 1
dtype = torch.float

def make_rois(num_rois=1000):
rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype)
rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index
rois[:, 3:] += rois[:, 1:3] # make sure boxes aren't degenerate
return rois

for aligned in (True, False):
for scale, zero_point in ((1, 0), (2, 10), (0.1, 50)):
for qdtype in (torch.qint8, torch.quint8, torch.qint32):

x = torch.randint(50, 100, size=(num_imgs, n_channels, img_size, img_size)).to(dtype)
qx = torch.quantize_per_tensor(x, scale=scale, zero_point=zero_point, dtype=qdtype)

rois = make_rois()
qrois = torch.quantize_per_tensor(rois, scale=scale, zero_point=zero_point, dtype=qdtype)

x, rois = qx.dequantize(), qrois.dequantize() # we want to pass the same inputs

y = ops.roi_align(
x,
rois,
output_size=pool_size,
spatial_scale=1,
sampling_ratio=-1,
aligned=aligned,
)
qy = ops.roi_align(
qx,
qrois,
output_size=pool_size,
spatial_scale=1,
sampling_ratio=-1,
aligned=aligned,
)

# The output qy is itself a quantized tensor and there might have been a loss of info when it was
# quantized. For a fair comparison we need to quantize y as well
quantized_float_y = torch.quantize_per_tensor(y, scale=scale, zero_point=zero_point, dtype=qdtype)

try:
# Ideally, we would assert this, which passes with (scale, zero) == (1, 0)
self.assertTrue((qy == quantized_float_y).all())
except AssertionError:
# But because the computation aren't exactly the same between the 2 RoIAlign procedures, some
# rounding error may lead to a difference of 2 in the output.
# For example with (scale, zero) = (2, 10), 45.00000... will be quantized to 44
# but 45.00000001 will be rounded to 46. We make sure below that:
# - such discrepancies between qy and quantized_float_y are very rare (less then 5%)
# - any difference between qy and quantized_float_y is == scale
diff_idx = torch.where(qy != quantized_float_y)
num_diff = diff_idx[0].numel()
self.assertTrue(num_diff / qy.numel() < .05)

abs_diff = torch.abs(qy[diff_idx].dequantize() - quantized_float_y[diff_idx].dequantize())
t_scale = torch.full_like(abs_diff, fill_value=scale)
self.assertTrue(torch.allclose(abs_diff, t_scale, atol=1e-5))

x = torch.randint(50, 100, size=(2, 3, 10, 10)).to(dtype)
qx = torch.quantize_per_tensor(x, scale=1, zero_point=0, dtype=torch.qint8)
rois = make_rois(10)
qrois = torch.quantize_per_tensor(rois, scale=1, zero_point=0, dtype=torch.qint8)
with self.assertRaisesRegex(RuntimeError, "Only one image per batch is allowed"):
ops.roi_align(qx, qrois, output_size=pool_size)


class PSRoIAlignTester(RoIOpTester, unittest.TestCase):
def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs):
Expand Down
128 changes: 128 additions & 0 deletions torchvision/csrc/ops/cpu/roi_align_common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
#pragma once

#include <ATen/ATen.h>

namespace vision {
namespace ops {
namespace detail {

template <typename T>
struct PreCalc {
  // Flat indices (into a height*width channel plane) of the 4 neighboring
  // pixels used for bilinear interpolation.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Corresponding bilinear interpolation weights.
  T w1;
  T w2;
  T w3;
  T w4;
};

// This helper computes the interpolation weights (w1, w2...) for every sampling
// point of a given box. There are pool_height * pool_width * roi_bin_grid_h *
// roi_bin_grid_w such sampling points.
//
// The weights (w1, w2...) are computed as the areas in this figure:
// https://en.wikipedia.org/wiki/Bilinear_interpolation#/media/File:Bilinear_interpolation_visualisation.svg
// and pos1, pos2 etc correspond to the indices of their respective pixels.
//
// Note: the weights and indices are shared across all channels, which is why
// they are pre-calculated prior to the main loop in the RoIAlign kernel.
// implementation taken from Caffe2
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < roi_bin_grid_h; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < roi_bin_grid_w; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // Sampling point is outside the feature map: value-initialize
            // (all indices and weights zero) so it contributes nothing.
            pre_calc[pre_calc_index] = PreCalc<T>{};
            pre_calc_index += 1;
            continue;
          }

          // Points slightly out of bounds (in [-1, 0]) are clamped to the border.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Fractional offsets within the pixel cell; the 4 weights are the
          // areas of the opposite sub-rectangles (they sum to 1).
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

} // namespace detail
} // namespace ops
} // namespace vision
125 changes: 8 additions & 117 deletions torchvision/csrc/ops/cpu/roi_align_kernel.cpp
Original file line number Diff line number Diff line change
@@ -1,120 +1,13 @@
#include <ATen/ATen.h>
#include <torch/library.h>

#include "./roi_align_common.h"

namespace vision {
namespace ops {

namespace {

// implementation taken from Caffe2
template <typename T>
struct PreCalc {
  // Flat indices (into a height*width channel plane) of the 4 neighboring
  // pixels used for bilinear interpolation.
  int pos1;
  int pos2;
  int pos3;
  int pos4;
  // Corresponding bilinear interpolation weights.
  T w1;
  T w2;
  T w3;
  T w4;
};

// Pre-computes the bilinear-interpolation weights and pixel indices for every
// sampling point of a box (iy_upper * ix_upper points per pooled output cell),
// so the main RoIAlign loop can reuse them across all channels.
template <typename T>
void pre_calc_for_bilinear_interpolate(
    int height,
    int width,
    int pooled_height,
    int pooled_width,
    int iy_upper,
    int ix_upper,
    T roi_start_h,
    T roi_start_w,
    T bin_size_h,
    T bin_size_w,
    int roi_bin_grid_h,
    int roi_bin_grid_w,
    std::vector<PreCalc<T>>& pre_calc) {
  int pre_calc_index = 0;
  for (int ph = 0; ph < pooled_height; ph++) {
    for (int pw = 0; pw < pooled_width; pw++) {
      for (int iy = 0; iy < iy_upper; iy++) {
        const T yy = roi_start_h + ph * bin_size_h +
            static_cast<T>(iy + .5f) * bin_size_h /
                static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
        for (int ix = 0; ix < ix_upper; ix++) {
          const T xx = roi_start_w + pw * bin_size_w +
              static_cast<T>(ix + .5f) * bin_size_w /
                  static_cast<T>(roi_bin_grid_w);

          T x = xx;
          T y = yy;
          // deal with: inverse elements are out of feature map boundary
          if (y < -1.0 || y > height || x < -1.0 || x > width) {
            // Sampling point is outside the feature map: value-initialize
            // (all indices and weights zero) so it contributes nothing.
            pre_calc[pre_calc_index] = PreCalc<T>{};
            pre_calc_index += 1;
            continue;
          }

          // Points slightly out of bounds (in [-1, 0]) are clamped to the border.
          if (y <= 0) {
            y = 0;
          }
          if (x <= 0) {
            x = 0;
          }

          int y_low = static_cast<int>(y);
          int x_low = static_cast<int>(x);
          int y_high;
          int x_high;

          if (y_low >= height - 1) {
            y_high = y_low = height - 1;
            y = static_cast<T>(y_low);
          } else {
            y_high = y_low + 1;
          }

          if (x_low >= width - 1) {
            x_high = x_low = width - 1;
            x = static_cast<T>(x_low);
          } else {
            x_high = x_low + 1;
          }

          // Fractional offsets within the pixel cell; the 4 weights are the
          // areas of the opposite sub-rectangles (they sum to 1).
          T ly = y - y_low;
          T lx = x - x_low;
          T hy = 1. - ly, hx = 1. - lx;
          T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;

          // save weights and indices
          PreCalc<T> pc;
          pc.pos1 = y_low * width + x_low;
          pc.pos2 = y_low * width + x_high;
          pc.pos3 = y_high * width + x_low;
          pc.pos4 = y_high * width + x_high;
          pc.w1 = w1;
          pc.w2 = w2;
          pc.w3 = w3;
          pc.w4 = w4;
          pre_calc[pre_calc_index] = pc;

          pre_calc_index += 1;
        }
      }
    }
  }
}

template <typename T>
void roi_align_forward_kernel_impl(
int n_rois,
Expand Down Expand Up @@ -167,17 +60,15 @@ void roi_align_forward_kernel_impl(
// When the grid is empty, output zeros.
const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4

// we want to precalculate indeces and weights shared by all chanels,
// this is the key point of optimiation
std::vector<PreCalc<T>> pre_calc(
// we want to precalculate indices and weights shared by all chanels,
// this is the key point of optimization
std::vector<detail::PreCalc<T>> pre_calc(
roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height);
pre_calc_for_bilinear_interpolate(
detail::pre_calc_for_bilinear_interpolate(
height,
width,
pooled_height,
pooled_width,
roi_bin_grid_h,
roi_bin_grid_w,
roi_start_h,
roi_start_w,
bin_size_h,
Expand All @@ -199,15 +90,15 @@ void roi_align_forward_kernel_impl(
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) {
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
PreCalc<T> pc = pre_calc[pre_calc_index];
detail::PreCalc<T> pc = pre_calc[pre_calc_index];
output_val += pc.w1 * offset_input[pc.pos1] +
pc.w2 * offset_input[pc.pos2] +
pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4];

pre_calc_index += 1;
}
}
output_val /= count;
output_val /= count; // Average pooling

output[index] = output_val;
} // for pw
Expand Down
Loading

0 comments on commit 5af2cf0

Please sign in to comment.