From 67e7e7a39a71d1e2672fd6e42f9104660efa2ac6 Mon Sep 17 00:00:00 2001
From: HappyAngel <chenjiaobuaa@126.com>
Date: Wed, 19 Jan 2022 14:45:04 +0800
Subject: [PATCH] fix boxcoder compute error (#8271)

---
 lite/backends/arm/math/box_coder.cc           | 352 +++++++-----------
 lite/backends/arm/math/box_coder.h            |   1 +
 lite/kernels/arm/box_coder_compute.cc         |   1 +
 .../tests/unittest_py/op/test_box_coder_op.py |  16 +-
 4 files changed, 135 insertions(+), 235 deletions(-)

diff --git a/lite/backends/arm/math/box_coder.cc b/lite/backends/arm/math/box_coder.cc
index 3f5326c3b8b..7f562fcbdd8 100644
--- a/lite/backends/arm/math/box_coder.cc
+++ b/lite/backends/arm/math/box_coder.cc
@@ -21,185 +21,92 @@ namespace lite {
 namespace arm {
 namespace math {
 void encode_bbox_center_kernel(const int batch_num,  // N
-                               const float* loc_data,
-                               const float* prior_data,
+                               const float* target_box_data,
+                               const float* prior_box_data,
                                const float* variance,
                                const bool var_len4,
+                               const bool normalized,
                                const int num_priors,  // M
-                               float* bbox_data) {
-  int cnt = num_priors / 4;
-  //! vprior 0: xmin, 1: ymin, 2: xmax, 3: ymax
-  //! vloc   0: xmin, 1: ymin, 2: xmax, 3: ymax
-  //! vvar
-  float32x4_t vhalf = vdupq_n_f32(0.5f);
-
-  int len_batch = num_priors * 4;
+                               float* output) {
+  int len = 4;
+  int size = batch_num * num_priors;
+  float norm_value = (normalized == false) ? 1 : 0;
   if (var_len4) {
-    for (int n = 0; n < batch_num; ++n) {
-      const float* ptr_loc_batch = loc_data + n * len_batch;
-      float* ptr_bbox_batch = bbox_data + n * len_batch;
-
-      LITE_PARALLEL_BEGIN(i, tid, cnt) {
-        int idx = i * 16;
-        const float* ptr_loc = ptr_loc_batch + idx;
-        const float* ptr_prior = prior_data + idx;
-        float* ptr_bbox = ptr_bbox_batch + idx;
-
-        float32x4x4_t vprior = vld4q_f32(ptr_prior);
-        float32x4x4_t vloc = vld4q_f32(ptr_loc);
-        float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]);
-        float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]);
-        float32x4_t vprior_cx =
-            vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf);
-        float32x4_t vprior_cy =
-            vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf);
-
-        float32x4_t vdec_bbx_cx =
-            vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
-        float32x4_t vdec_bbx_cy =
-            vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
-        float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
-        float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
-        vprior_width = vmulq_f32(vprior_width, vhalf);
-        vprior_height = vmulq_f32(vprior_height, vhalf);
-        vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
-        vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);
-
-        vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
-        vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
-        vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w);
-        vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h);
-
-        vst4q_f32(ptr_bbox, vloc);
-        for (int k = 0; k < 4; k++) {
-          ptr_bbox[k] /= variance[k];
-          ptr_bbox[k + 4] /= variance[k];
-          ptr_bbox[k + 8] /= variance[k];
-          ptr_bbox[k + 12] /= variance[k];
-        }
-      }
-      LITE_PARALLEL_END()
-
-      LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) {
-        int idx = i * 4;
-        float p_xmin = prior_data[idx];
-        float p_ymin = prior_data[idx + 1];
-        float p_xmax = prior_data[idx + 2];
-        float p_ymax = prior_data[idx + 3];
-        float prior_width = p_xmax - p_xmin;
-        float prior_height = p_ymax - p_ymin;
-        float prior_center_x = (p_xmin + p_xmax) / 2.f;
-        float prior_center_y = (p_ymin + p_ymax) / 2.f;
-
-        float xmin = ptr_loc_batch[idx];
-        float ymin = ptr_loc_batch[idx + 1];
-        float xmax = ptr_loc_batch[idx + 2];
-        float ymax = ptr_loc_batch[idx + 3];
-
-        //! variance is encoded in target, we simply need to retore the offset
-        //! predictions.
-        float decode_bbox_center_x = xmin * prior_width + prior_center_x;
-        float decode_bbox_center_y = ymin * prior_height + prior_center_y;
-        float decode_bbox_width = expf(xmax) * prior_width;
-        float decode_bbox_height = expf(ymax) * prior_height;
-
-        ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
-        ptr_bbox_batch[idx + 1] =
-            decode_bbox_center_y - decode_bbox_height / 2.f;
-        ptr_bbox_batch[idx + 2] =
-            decode_bbox_center_x + decode_bbox_width / 2.f;
-        ptr_bbox_batch[idx + 3] =
-            decode_bbox_center_y + decode_bbox_height / 2.f;
-        ptr_bbox_batch[idx] /= variance[0];
-        ptr_bbox_batch[idx + 1] /= variance[1];
-        ptr_bbox_batch[idx + 2] /= variance[2];
-        ptr_bbox_batch[idx + 3] /= variance[3];
-      }
-      LITE_PARALLEL_END()
+    LITE_PARALLEL_BEGIN(k, tid, size) {
+      int i = k / num_priors;
+      int j = k % num_priors;
+      size_t index_i = i * len;
+      size_t index_j = j * len;
+      size_t offset = k * len;
+      float prior_box_width =
+          prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value;
+      float prior_box_height = prior_box_data[index_j + 3] -
+                               prior_box_data[index_j + 1] + norm_value;
+      float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2;
+      float prior_box_center_y =
+          prior_box_data[index_j + 1] + prior_box_height / 2;
+
+      float target_box_center_x =
+          (target_box_data[index_i + 2] + target_box_data[index_i]) / 2;
+      float target_box_center_y =
+          (target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2;
+      float target_box_width =
+          target_box_data[index_i + 2] - target_box_data[index_i] + norm_value;
+      float target_box_height = target_box_data[index_i + 3] -
+                                target_box_data[index_i + 1] + norm_value;
+
+      output[offset] =
+          (target_box_center_x - prior_box_center_x) / prior_box_width;
+      output[offset + 1] =
+          (target_box_center_y - prior_box_center_y) / prior_box_height;
+      output[offset + 2] =
+          std::log(std::fabs(target_box_width / prior_box_width));
+      output[offset + 3] =
+          std::log(std::fabs(target_box_height / prior_box_height));
+      output[offset] /= variance[0];
+      output[offset + 1] /= variance[1];
+      output[offset + 2] /= variance[2];
+      output[offset + 3] /= variance[3];
     }
+    LITE_PARALLEL_END()
   } else {
-    for (int n = 0; n < batch_num; ++n) {
-      const float* ptr_loc_batch = loc_data + n * len_batch;
-      float* ptr_bbox_batch = bbox_data + n * len_batch;
-
-      LITE_PARALLEL_BEGIN(i, tid, cnt) {
-        int idx = i * 16;
-        const float* ptr_loc = ptr_loc_batch + idx;
-        const float* ptr_prior = prior_data + idx;
-        float* ptr_bbox = ptr_bbox_batch + idx;
-
-        float32x4x4_t vprior = vld4q_f32(ptr_prior);
-        float32x4x4_t vloc = vld4q_f32(ptr_loc);
-        float32x4_t vprior_width = vsubq_f32(vprior.val[2], vprior.val[0]);
-        float32x4_t vprior_height = vsubq_f32(vprior.val[3], vprior.val[1]);
-        float32x4_t vprior_cx =
-            vmulq_f32(vaddq_f32(vprior.val[0], vprior.val[2]), vhalf);
-        float32x4_t vprior_cy =
-            vmulq_f32(vaddq_f32(vprior.val[1], vprior.val[3]), vhalf);
-
-        float32x4_t vdec_bbx_cx =
-            vaddq_f32(vmulq_f32(vloc.val[0], vprior_width), vprior_cx);
-        float32x4_t vdec_bbx_cy =
-            vaddq_f32(vmulq_f32(vloc.val[1], vprior_height), vprior_cy);
-        float32x4_t vdec_bbx_w = exp_ps(vloc.val[2]);
-        float32x4_t vdec_bbx_h = exp_ps(vloc.val[3]);
-        vprior_width = vmulq_f32(vprior_width, vhalf);
-        vprior_height = vmulq_f32(vprior_height, vhalf);
-        vdec_bbx_w = vmulq_f32(vdec_bbx_w, vprior_width);
-        vdec_bbx_h = vmulq_f32(vdec_bbx_h, vprior_height);
-
-        vloc.val[0] = vsubq_f32(vdec_bbx_cx, vdec_bbx_w);
-        vloc.val[1] = vsubq_f32(vdec_bbx_cy, vdec_bbx_h);
-        vloc.val[2] = vaddq_f32(vdec_bbx_cx, vdec_bbx_w);
-        vloc.val[3] = vaddq_f32(vdec_bbx_cy, vdec_bbx_h);
-
-        vst4q_f32(ptr_bbox, vloc);
-        for (int k = 0; k < 4; k++) {
-          ptr_bbox[k] /= variance[idx + k];
-          ptr_bbox[k + 4] /= variance[idx + k];
-          ptr_bbox[k + 8] /= variance[idx + k];
-          ptr_bbox[k + 12] /= variance[idx + k];
-        }
-      }
-      LITE_PARALLEL_END()
-
-      LITE_PARALLEL_COMMON_BEGIN(i, tid, num_priors, cnt * 4, 1) {
-        int idx = i * 4;
-        float p_xmin = prior_data[idx];
-        float p_ymin = prior_data[idx + 1];
-        float p_xmax = prior_data[idx + 2];
-        float p_ymax = prior_data[idx + 3];
-        float prior_width = p_xmax - p_xmin;
-        float prior_height = p_ymax - p_ymin;
-        float prior_center_x = (p_xmin + p_xmax) / 2.f;
-        float prior_center_y = (p_ymin + p_ymax) / 2.f;
-
-        float xmin = ptr_loc_batch[idx];
-        float ymin = ptr_loc_batch[idx + 1];
-        float xmax = ptr_loc_batch[idx + 2];
-        float ymax = ptr_loc_batch[idx + 3];
-
-        //! variance is encoded in target, we simply need to retore the offset
-        //! predictions.
-        float decode_bbox_center_x = xmin * prior_width + prior_center_x;
-        float decode_bbox_center_y = ymin * prior_height + prior_center_y;
-        float decode_bbox_width = expf(xmax) * prior_width;
-        float decode_bbox_height = expf(ymax) * prior_height;
-
-        ptr_bbox_batch[idx] = decode_bbox_center_x - decode_bbox_width / 2.f;
-        ptr_bbox_batch[idx + 1] =
-            decode_bbox_center_y - decode_bbox_height / 2.f;
-        ptr_bbox_batch[idx + 2] =
-            decode_bbox_center_x + decode_bbox_width / 2.f;
-        ptr_bbox_batch[idx + 3] =
-            decode_bbox_center_y + decode_bbox_height / 2.f;
-        ptr_bbox_batch[idx] /= variance[idx];
-        ptr_bbox_batch[idx + 1] /= variance[idx + 1];
-        ptr_bbox_batch[idx + 2] /= variance[idx + 2];
-        ptr_bbox_batch[idx + 3] /= variance[idx + 3];
-      }
-      LITE_PARALLEL_END()
+    LITE_PARALLEL_BEGIN(k, tid, size) {
+      int i = k / num_priors;
+      int j = k % num_priors;
+      size_t index_i = i * len;
+      size_t index_j = j * len;
+      size_t offset = k * len;
+      float prior_box_width =
+          prior_box_data[index_j + 2] - prior_box_data[index_j] + norm_value;
+      float prior_box_height = prior_box_data[index_j + 3] -
+                               prior_box_data[index_j + 1] + norm_value;
+      float prior_box_center_x = prior_box_data[index_j] + prior_box_width / 2;
+      float prior_box_center_y =
+          prior_box_data[index_j + 1] + prior_box_height / 2;
+
+      float target_box_center_x =
+          (target_box_data[index_i + 2] + target_box_data[index_i]) / 2;
+      float target_box_center_y =
+          (target_box_data[index_i + 3] + target_box_data[index_i + 1]) / 2;
+      float target_box_width =
+          target_box_data[index_i + 2] - target_box_data[index_i] + norm_value;
+      float target_box_height = target_box_data[index_i + 3] -
+                                target_box_data[index_i + 1] + norm_value;
+
+      output[offset] =
+          (target_box_center_x - prior_box_center_x) / prior_box_width;
+      output[offset + 1] =
+          (target_box_center_y - prior_box_center_y) / prior_box_height;
+      output[offset + 2] =
+          std::log(std::fabs(target_box_width / prior_box_width));
+      output[offset + 3] =
+          std::log(std::fabs(target_box_height / prior_box_height));
+      output[offset] /= variance[index_j];
+      output[offset + 1] /= variance[index_j + 1];
+      output[offset + 2] /= variance[index_j + 2];
+      output[offset + 3] /= variance[index_j + 3];
     }
+    LITE_PARALLEL_END()
   }
 }
 
@@ -216,7 +123,7 @@ void decode_bbox_center_kernel(const int batch_num,
   //! vloc   0: xmin, 1: ymin, 2: xmax, 3: ymax
   //! vvar
   float32x4_t vhalf = vdupq_n_f32(0.5f);
-  float norm_value = (normalized == false);
+  float norm_value = (normalized == false) ? 1.f : 0.f;
   float32x4_t vnormalized = vdupq_n_f32(norm_value);
   int len_batch = num_priors * 4;
   for (int n = 0; n < batch_num; ++n) {
@@ -336,57 +243,58 @@ void decode_center_size_axis_1(const int var_size,
                                const bool normalized,
                                const std::vector<float> variance,
                                float* output) {
-  for (int i = 0; i < row; ++i) {
-    for (int j = 0; j < col; ++j) {
-      float var_data[4] = {1., 1., 1., 1.};
-      float* var_ptr = var_data;
-      size_t offset = i * col * len + j * len;
-      int prior_box_offset = i * len;  // axis == 0 ? j * len : i * len;
-
-      float prior_box_width = prior_box_data[prior_box_offset + 2] -
-                              prior_box_data[prior_box_offset] +
-                              (normalized == false);
-      float prior_box_height = prior_box_data[prior_box_offset + 3] -
-                               prior_box_data[prior_box_offset + 1] +
-                               (normalized == false);
-      float prior_box_center_x =
-          prior_box_data[prior_box_offset] + prior_box_width / 2;
-      float prior_box_center_y =
-          prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
-
-      float target_box_center_x = 0, target_box_center_y = 0;
-      float target_box_width = 0, target_box_height = 0;
-      int prior_var_offset = i * len;  // axis == 0 ? j * len : i * len;
-      if (var_size == 2) {
-        std::memcpy(
-            var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
-      } else if (var_size == 1) {
-        var_ptr = const_cast<float*>(variance.data());
-      }
-      float box_var_x = *var_ptr;
-      float box_var_y = *(var_ptr + 1);
-      float box_var_w = *(var_ptr + 2);
-      float box_var_h = *(var_ptr + 3);
-
-      target_box_center_x =
-          box_var_x * target_box_data[offset] * prior_box_width +
-          prior_box_center_x;
-      target_box_center_y =
-          box_var_y * target_box_data[offset + 1] * prior_box_height +
-          prior_box_center_y;
-      target_box_width =
-          std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
-      target_box_height =
-          std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
-
-      output[offset] = target_box_center_x - target_box_width / 2;
-      output[offset + 1] = target_box_center_y - target_box_height / 2;
-      output[offset + 2] =
-          target_box_center_x + target_box_width / 2 - (normalized == false);
-      output[offset + 3] =
-          target_box_center_y + target_box_height / 2 - (normalized == false);
+  float norm_value = (normalized == false) ? 1.f : 0.f;
+  int size = row * col;
+  LITE_PARALLEL_BEGIN(k, tid, size) {
+    int i = k / col;
+    int j = k % col;
+    float var_data[4] = {1., 1., 1., 1.};
+    float* var_ptr = var_data;
+    size_t offset = i * col * len + j * len;
+    int prior_box_offset = i * len;  // axis == 0 ? j * len : i * len;
+
+    float prior_box_width = prior_box_data[prior_box_offset + 2] -
+                            prior_box_data[prior_box_offset] + norm_value;
+    float prior_box_height = prior_box_data[prior_box_offset + 3] -
+                             prior_box_data[prior_box_offset + 1] + norm_value;
+    float prior_box_center_x =
+        prior_box_data[prior_box_offset] + prior_box_width / 2;
+    float prior_box_center_y =
+        prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
+
+    float target_box_center_x = 0, target_box_center_y = 0;
+    float target_box_width = 0, target_box_height = 0;
+    int prior_var_offset = i * len;  // axis == 0 ? j * len : i * len;
+    if (var_size == 2) {
+      std::memcpy(
+          var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
+    } else if (var_size == 1) {
+      var_ptr = const_cast<float*>(variance.data());
     }
+    float box_var_x = *var_ptr;
+    float box_var_y = *(var_ptr + 1);
+    float box_var_w = *(var_ptr + 2);
+    float box_var_h = *(var_ptr + 3);
+
+    target_box_center_x =
+        box_var_x * target_box_data[offset] * prior_box_width +
+        prior_box_center_x;
+    target_box_center_y =
+        box_var_y * target_box_data[offset + 1] * prior_box_height +
+        prior_box_center_y;
+    target_box_width =
+        std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
+    target_box_height =
+        std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
+
+    output[offset] = target_box_center_x - target_box_width / 2;
+    output[offset + 1] = target_box_center_y - target_box_height / 2;
+    output[offset + 2] =
+        target_box_center_x + target_box_width / 2 - norm_value;
+    output[offset + 3] =
+        target_box_center_y + target_box_height / 2 - norm_value;
   }
+  LITE_PARALLEL_END()
 }
 
 }  // namespace math
diff --git a/lite/backends/arm/math/box_coder.h b/lite/backends/arm/math/box_coder.h
index 429b9e305a9..45851c11cac 100644
--- a/lite/backends/arm/math/box_coder.h
+++ b/lite/backends/arm/math/box_coder.h
@@ -28,6 +28,7 @@ void encode_bbox_center_kernel(const int batch_num,
                                const float* prior_data,
                                const float* variance,
                                const bool var_len4,
+                               const bool normalized,
                                const int num_priors,
                                float* bbox_data);
 
diff --git a/lite/kernels/arm/box_coder_compute.cc b/lite/kernels/arm/box_coder_compute.cc
index f2efa2aa09a..003cbc8440a 100644
--- a/lite/kernels/arm/box_coder_compute.cc
+++ b/lite/kernels/arm/box_coder_compute.cc
@@ -69,6 +69,7 @@ void BoxCoderCompute::Run() {
                                                prior_box_data,
                                                variance_data,
                                                var_len4,
+                                               normalized,
                                                col,
                                                output);
   } else if (code_type == "decode_center_size") {
diff --git a/lite/tests/unittest_py/op/test_box_coder_op.py b/lite/tests/unittest_py/op/test_box_coder_op.py
index 438e864b5ff..ec0f3c6f1c5 100644
--- a/lite/tests/unittest_py/op/test_box_coder_op.py
+++ b/lite/tests/unittest_py/op/test_box_coder_op.py
@@ -31,7 +31,6 @@
 class TestBoxCoderOp(AutoScanTest):
     def __init__(self, *args, **kwargs):
         AutoScanTest.__init__(self, *args, **kwargs)
-        # precision has diff on arm
         self.enable_testing_on_place(
             TargetType.ARM,
             PrecisionType.FP32,
@@ -134,7 +133,9 @@ def generate_targetbox(*args, **kwargs):
         return program_config
 
     def sample_predictor_configs(self):
-        return self.get_predictor_configs(), ["box_coder"], (1e-5, 1e-5)
+        # encode_center_size needs abs_error = 1e-4 (its output is divided by variance);
+        # decode_center_size passes at 1e-5, so the looser 1e-4 bound covers both.
+        return self.get_predictor_configs(), ["box_coder"], (1e-4, 1e-4)
 
     def add_ignore_pass_case(self):
         def teller1(program_config, predictor_config):
@@ -146,21 +147,10 @@ def teller1(program_config, predictor_config):
                                     0].inputs:
                     return True
 
-        def teller2(program_config, predictor_config):
-            if predictor_config.target() == TargetType.ARM:
-                if program_config.ops[0].attrs[
-                        "code_type"] == "encode_center_size":
-                    return True
-            return False
-
         self.add_ignore_check_case(
             teller1, IgnoreReasons.PADDLELITE_NOT_SUPPORT,
             "Lite is not supported on opencl. We need to fix it as soon as possible."
         )
-        self.add_ignore_check_case(
-            teller2, IgnoreReasons.ACCURACY_ERROR,
-            "The op output has diff in a specific case on arm. We need to fix it as soon as possible."
-        )
 
     def test(self, *args, **kwargs):
         target_str = self.get_target()